diff --git a/.github/actions/rust-toolchain-setup/action.yml b/.github/actions/rust-toolchain-setup/action.yml deleted file mode 100644 index bf73fede16c7..000000000000 --- a/.github/actions/rust-toolchain-setup/action.yml +++ /dev/null @@ -1,44 +0,0 @@ -# yaml-language-server: $schema=https://json.schemastore.org/github-action.json - -name: 'Rust toolchain setup' -description: 'Common setup steps for GitHub workflows for Rust projects' - -runs: - using: composite - steps: - - uses: dtolnay/rust-toolchain@1.71.0 - with: - components: clippy, rustfmt - - uses: extractions/setup-just@v1 - with: - just-version: '1.15.0' # optional semver specification, otherwise latest - - ### - ### Linux setup - ### - - name: rustup - # We need to use the nightly rust tool change to enable registry-auth / to connect to ADO feeds. - if: ${{ (runner.os == 'Linux') }} - run: | - rustup set profile minimal - rustup install - shell: bash - # - name: Cargo login - # if: ${{ (runner.os == 'Linux') }} - # run: just cargo-login-ci - # shell: bash - - ### - ### Windows setup - ### - - name: rustup - # We need to use the nightly rust tool change to enable registry-auth / to connect to ADO feeds. - if: ${{ (runner.os == 'Windows') }} - run: | - rustup set profile minimal - rustup install - shell: pwsh - # - name: Cargo login - # if: ${{ (runner.os == 'Windows') }} - # run: just cargo-login-ci-windows - # shell: pwsh diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index 4a5b87b3e69e..e4d1b91bab73 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -47,6 +47,14 @@ jobs: # Details on CodeQL's query packs refer to: https://docs.github.com/en/code-security/code-scanning/automatically-scanning-your-code-for-vulnerabilities-and-errors/configuring-code-scanning#using-queries-in-ql-packs queries: security-extended,security-and-quality + # Setup Java to use a version that is not too old for the project + - if: ${{ matrix.language == 'java' }} + name: Setup Java 11 + uses: actions/setup-java@v4 + with: + java-version: '11' + distribution: 'microsoft' + # Autobuild attempts to build any compiled languages (C/C++, C#, or Java). 
# If this step fails, then you should remove it and run the build manually (see below) - if: ${{ matrix.language != 'cpp' }} diff --git a/.github/workflows/gradle-wrapper-validation.yml b/.github/workflows/gradle-wrapper-validation.yml index 03ea773a2513..73df5e31fda6 100644 --- a/.github/workflows/gradle-wrapper-validation.yml +++ b/.github/workflows/gradle-wrapper-validation.yml @@ -11,4 +11,4 @@ jobs: runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 - - uses: gradle/wrapper-validation-action@v1 + - uses: gradle/wrapper-validation-action@v3 diff --git a/.github/workflows/labeler.yml b/.github/workflows/labeler.yml index ce8fb3160954..a196226a4b83 100644 --- a/.github/workflows/labeler.yml +++ b/.github/workflows/labeler.yml @@ -3,11 +3,14 @@ on: issues: types: [opened, edited] +permissions: + issues: write + jobs: triage: runs-on: ubuntu-latest steps: - - uses: github/issue-labeler@v3.3 + - uses: github/issue-labeler@v3.4 with: repo-token: "${{ secrets.GITHUB_TOKEN }}" configuration-path: .github/labeler.yml diff --git a/.github/workflows/publish-csharp-apidocs.yml b/.github/workflows/publish-csharp-apidocs.yml index c03399f4693b..5bc21595bf88 100644 --- a/.github/workflows/publish-csharp-apidocs.yml +++ b/.github/workflows/publish-csharp-apidocs.yml @@ -37,7 +37,7 @@ jobs: wget https://github.com/dotnet/docfx/releases/download/v${DOCFXVERSION}/docfx-linux-x64-v${DOCFXVERSION}.zip -O build/docfx/docfx.zip unzip build/docfx/docfx.zip -d build/docfx - name: Install NuGet - uses: nuget/setup-nuget@v1 + uses: nuget/setup-nuget@v2 - name: Build Documentation run: | build/docfx/docfx metadata csharp/ApiDocs/docfx.json diff --git a/.github/workflows/publish-java-apidocs.yml b/.github/workflows/publish-java-apidocs.yml index 708842e59f9f..3e553049a186 100644 --- a/.github/workflows/publish-java-apidocs.yml +++ b/.github/workflows/publish-java-apidocs.yml @@ -30,7 +30,7 @@ jobs: java-version: '11' distribution: 'adopt' - name: Build with Gradle - uses: gradle/gradle-build-action@v2 + uses: gradle/gradle-build-action@v3 with: build-root-directory: java gradle-executable: java/gradlew diff --git a/.github/workflows/publish-objectivec-apidocs.yml b/.github/workflows/publish-objectivec-apidocs.yml index b9f3c0b9a398..ebacd38f1f88 100644 --- a/.github/workflows/publish-objectivec-apidocs.yml +++ b/.github/workflows/publish-objectivec-apidocs.yml @@ -21,7 +21,7 @@ permissions: jobs: build: name: Generate Objective-C API docs - runs-on: macos-13 + runs-on: macos-latest steps: - uses: actions/checkout@v4 diff --git a/.github/workflows/rust-ci.yml b/.github/workflows/rust-ci.yml deleted file mode 100644 index 725c40c2ded5..000000000000 --- a/.github/workflows/rust-ci.yml +++ /dev/null @@ -1,132 +0,0 @@ -name: Rust - -on: [pull_request] - -env: - CARGO_TERM_COLOR: always - RUST_LOG: onnxruntime=debug,onnxruntime-sys=debug - RUST_BACKTRACE: 1 - MANIFEST_PATH: ${{ github.workspace }}/rust/Cargo.toml - -jobs: - fmt: - name: Rustfmt - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v4 - - uses: ./.github/actions/rust-toolchain-setup - - name: vendor onnxruntime source - run: just vendor - - name: fmt - run: cargo fmt --all -- --check - - download: - name: Download prebuilt ONNX Runtime archive from build.rs - runs-on: ubuntu-latest - env: - ORT_RUST_STRATEGY: download - steps: - - uses: actions/checkout@v4 - - uses: ./.github/actions/rust-toolchain-setup - - run: rustup target install x86_64-unknown-linux-gnu - - run: rustup target install x86_64-apple-darwin - - run: rustup target install 
i686-pc-windows-msvc - - run: rustup target install x86_64-pc-windows-msvc - # ****************************************************************** - - name: Download prebuilt archive (CPU, x86_64-unknown-linux-gnu) - run: cargo build --target x86_64-unknown-linux-gnu --manifest-path ${{ env.MANIFEST_PATH }} - - name: Verify prebuilt archive downloaded (CPU, x86_64-unknown-linux-gnu) - run: ls -lh target/x86_64-unknown-linux-gnu/debug/build/onnxruntime-sys-*/out/onnxruntime-linux-x64-1.*.tgz - # ****************************************************************** - - name: Download prebuilt archive (CPU, x86_64-apple-darwin) - run: cargo build --target x86_64-apple-darwin --manifest-path ${{ env.MANIFEST_PATH }} - - name: Verify prebuilt archive downloaded (CPU, x86_64-apple-darwin) - run: ls -lh target/x86_64-apple-darwin/debug/build/onnxruntime-sys-*/out/onnxruntime-osx-x64-1.*.tgz - # ****************************************************************** - - name: Download prebuilt archive (CPU, i686-pc-windows-msvc) - run: cargo build --target i686-pc-windows-msvc --manifest-path ${{ env.MANIFEST_PATH }} - - name: Verify prebuilt archive downloaded (CPU, i686-pc-windows-msvc) - run: ls -lh target/i686-pc-windows-msvc/debug/build/onnxruntime-sys-*/out/onnxruntime-win-x86-1.*.zip - # ****************************************************************** - - name: Download prebuilt archive (CPU, x86_64-pc-windows-msvc) - run: cargo build --target x86_64-pc-windows-msvc --manifest-path ${{ env.MANIFEST_PATH }} - - name: Verify prebuilt archive downloaded (CPU, x86_64-pc-windows-msvc) - run: ls -lh target/x86_64-pc-windows-msvc/debug/build/onnxruntime-sys-*/out/onnxruntime-win-x64-1.*.zip - # ****************************************************************** - - name: Download prebuilt archive (GPU, x86_64-unknown-linux-gnu) - env: - ORT_USE_CUDA: "yes" - run: cargo build --target x86_64-unknown-linux-gnu --manifest-path ${{ env.MANIFEST_PATH }} - - name: Verify prebuilt archive downloaded (GPU, x86_64-unknown-linux-gnu) - run: ls -lh target/x86_64-unknown-linux-gnu/debug/build/onnxruntime-sys-*/out/onnxruntime-linux-x64-gpu-1.*.tgz - # ****************************************************************** - - name: Download prebuilt archive (GPU, x86_64-pc-windows-msvc) - env: - ORT_USE_CUDA: "yes" - run: cargo build --target x86_64-pc-windows-msvc --manifest-path ${{ env.MANIFEST_PATH }} - - name: Verify prebuilt archive downloaded (GPU, x86_64-pc-windows-msvc) - run: ls -lh target/x86_64-pc-windows-msvc/debug/build/onnxruntime-sys-*/out/onnxruntime-win-gpu-x64-1.*.zip - - test: - name: Test Suite - runs-on: ${{ matrix.os }} - strategy: - fail-fast: false - matrix: - target: - [ - x86_64-unknown-linux-gnu, - x86_64-apple-darwin, - x86_64-pc-windows-msvc, - i686-pc-windows-msvc, - ] - include: - - target: x86_64-unknown-linux-gnu - os: ubuntu-latest - - target: x86_64-apple-darwin - os: macos-latest - - target: x86_64-pc-windows-msvc - os: windows-latest - - target: i686-pc-windows-msvc - os: windows-latest - env: - CARGO_BUILD_TARGET: ${{ matrix.target }} - steps: - - uses: actions/checkout@v4 - - uses: ./.github/actions/rust-toolchain-setup - - name: vendor onnxruntime source - run: just vendor - - run: rustup target install ${{ matrix.target }} - - name: Install additional packages (macOS) - if: contains(matrix.target, 'x86_64-apple-darwin') - run: brew install libomp - - name: Build (cargo build) - run: cargo build --all --manifest-path ${{ env.MANIFEST_PATH }} - - name: Build tests (cargo test) - 
run: cargo test --no-run --manifest-path ${{ env.MANIFEST_PATH }} - - name: Build onnxruntime with 'model-fetching' feature - run: cargo build --manifest-path ${{ env.MANIFEST_PATH }} --features model-fetching - - name: Test onnxruntime-sys - run: cargo build --package onnxruntime-sys -- --test-threads=1 --nocapture - - name: Test onnxruntime - run: cargo test --manifest-path ${{ env.MANIFEST_PATH }} --features model-fetching -- --test-threads=1 --nocapture - - clippy: - name: Clippy - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v4 - - uses: ./.github/actions/rust-toolchain-setup - - name: vendor onnxruntime source - run: just vendor - - run: clippy --all-features --manifest-path ${{ env.MANIFEST_PATH }} -- -D warnings - - package-sys: - name: Package onnxruntime-sys - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v4 - - uses: ./.github/actions/rust-toolchain-setup - - name: vendor onnxruntime source - run: just vendor - - run: cargo package --allow-dirty --package onnxruntime-sys diff --git a/.github/workflows/stale.yml b/.github/workflows/stale.yml index c94e3fa5bcb8..181f3fb17d33 100644 --- a/.github/workflows/stale.yml +++ b/.github/workflows/stale.yml @@ -13,7 +13,7 @@ jobs: issues: write pull-requests: write steps: - - uses: actions/stale@v9.0.0 + - uses: actions/stale@v8 with: # Comma separated list of labels that can be assigned to issues to exclude them from being marked as stale exempt-issue-labels: contributions welcome, feature request, regression diff --git a/.gitmodules b/.gitmodules index 7bb49e98bfec..f874660971d4 100644 --- a/.gitmodules +++ b/.gitmodules @@ -7,4 +7,4 @@ [submodule "cmake/external/emsdk"] path = cmake/external/emsdk url = https://github.com/emscripten-core/emsdk.git - branch = 3.1.44 + branch = 3.1.51 diff --git a/.lintrunner.toml b/.lintrunner.toml index 4e5d077b08ff..be95e03479cf 100644 --- a/.lintrunner.toml +++ b/.lintrunner.toml @@ -132,6 +132,7 @@ exclude_patterns = [ 'onnxruntime/core/flatbuffers/schema/*.fbs.h', # Generated code 'onnxruntime/core/graph/contrib_ops/quantization_defs.cc', 'onnxruntime/core/mlas/**', # Contains assembly code + 'onnxruntime/core/mickey/cutlass_ext/**', # CUTLASS lib recommends NO automatic code formatting 'winml/lib/Api.Image/shaders/**', # Contains data chunks ] command = [ diff --git a/.pipelines/OneBranch.Nuget-WindowsAI-Pipeline.Official.yml b/.pipelines/OneBranch.Nuget-WindowsAI-Pipeline.Official.yml index b9de1b79e1d5..fd3b7266d30f 100644 --- a/.pipelines/OneBranch.Nuget-WindowsAI-Pipeline.Official.yml +++ b/.pipelines/OneBranch.Nuget-WindowsAI-Pipeline.Official.yml @@ -29,6 +29,8 @@ extends: git: submodules: false globalSdl: # https://aka.ms/obpipelines/sdl + asyncSdl: + enabled: false tsa: enabled: true prefast: @@ -53,10 +55,6 @@ extends: BuildArch: x86 PythonPackageName: pythonx86 - - template: .pipelines/windowsai-steps.yml@self - parameters: - BuildArch: arm - - template: .pipelines/windowsai-steps.yml@self parameters: BuildArch: arm64 @@ -72,11 +70,6 @@ extends: PythonPackageName: pythonx86 Runtime: static - - template: .pipelines/windowsai-steps.yml@self - parameters: - BuildArch: arm - Runtime: static - - template: .pipelines/windowsai-steps.yml@self parameters: BuildArch: arm64 @@ -94,11 +87,9 @@ extends: dependsOn: - Windows_Packaging_x64_dynamic - Windows_Packaging_x86_dynamic - - Windows_Packaging_arm_dynamic - Windows_Packaging_arm64_dynamic - Windows_Packaging_x64_static - Windows_Packaging_x86_static - - Windows_Packaging_arm_static - Windows_Packaging_arm64_static 
condition: succeeded() steps: @@ -120,12 +111,6 @@ extends: artifactName: 'drop_Windows_Build_Windows_Packaging_arm64_dynamic' targetPath: '$(Build.BinariesDirectory)/nuget-artifact-arm64' - - task: DownloadPipelineArtifact@0 - displayName: 'Download Pipeline Artifact - NuGet DirectML arm' - inputs: - artifactName: 'drop_Windows_Build_Windows_Packaging_arm_dynamic' - targetPath: '$(Build.BinariesDirectory)/nuget-artifact-arm' - - task: DownloadPipelineArtifact@0 displayName: 'Download Pipeline Artifact - NuGet DirectML x64 StaticRuntime' inputs: @@ -144,12 +129,6 @@ extends: artifactName: 'drop_Windows_Build_Windows_Packaging_arm64_static' targetPath: '$(Build.BinariesDirectory)/nuget-artifact-arm64-static-runtime' - - task: DownloadPipelineArtifact@0 - displayName: 'Download Pipeline Artifact - NuGet DirectML arm StaticRuntime' - inputs: - artifactName: 'drop_Windows_Build_Windows_Packaging_arm_static' - targetPath: '$(Build.BinariesDirectory)/nuget-artifact-arm-static-runtime' - - task: PowerShell@2 displayName: 'Bundle NuGet and other binaries' inputs: @@ -194,17 +173,7 @@ extends: $arm64_static_runtime_nupkg_unzipped_directory = [System.IO.Path]::Combine($arm64_static_runtime_nupkg_unzipped_directory_root, 'binaries', [System.IO.Path]::GetFileNameWithoutExtension($arm64_static_runtime_nuget_package)) [System.IO.Compression.ZipFile]::ExtractToDirectory($arm64_static_runtime_nuget_package, $arm64_static_runtime_nupkg_unzipped_directory) - $nupkgs = (Get-ChildItem ..\nuget-artifact-arm -Filter Microsoft.AI.MachineLearning*.nupkg -Recurse) - $arm_nuget_package = $nupkgs[0].FullName - $arm_nupkg_unzipped_directory_root = $nupkgs[0].Directory.FullName - $arm_nupkg_unzipped_directory = [System.IO.Path]::Combine($arm_nupkg_unzipped_directory_root, 'binaries', [System.IO.Path]::GetFileNameWithoutExtension($arm_nuget_package)) - [System.IO.Compression.ZipFile]::ExtractToDirectory($arm_nuget_package, $arm_nupkg_unzipped_directory) - - $nupkgs = (Get-ChildItem ..\nuget-artifact-arm-static-runtime -Filter Microsoft.AI.MachineLearning*.nupkg -Recurse) - $arm_static_runtime_nuget_package = $nupkgs[0].FullName - $arm_static_runtime_nupkg_unzipped_directory_root = $nupkgs[0].Directory.FullName - $arm_static_runtime_nupkg_unzipped_directory = [System.IO.Path]::Combine($arm_static_runtime_nupkg_unzipped_directory_root, 'binaries', [System.IO.Path]::GetFileNameWithoutExtension($arm_static_runtime_nuget_package)) - [System.IO.Compression.ZipFile]::ExtractToDirectory($arm_static_runtime_nuget_package, $arm_static_runtime_nupkg_unzipped_directory) + $x64_static_runtime_path_old = [System.IO.Path]::Combine($x64_static_runtime_nupkg_unzipped_directory, 'runtimes', 'win-x64', '_native') $x64_static_runtime_path_new = [System.IO.Path]::Combine($x64_nupkg_unzipped_directory, 'runtimes', 'win-x64', '_native', 'static') @@ -216,10 +185,7 @@ extends: $arm64_runtime_path_new = [System.IO.Path]::Combine($x64_nupkg_unzipped_directory, 'runtimes', 'win-arm64', '_native') $arm64_static_runtime_path_old = [System.IO.Path]::Combine($arm64_static_runtime_nupkg_unzipped_directory, 'runtimes', 'win-arm64', '_native') $arm64_static_runtime_path_new = [System.IO.Path]::Combine($x64_nupkg_unzipped_directory, 'runtimes', 'win-arm64', '_native', 'static') - $arm_runtime_path_old = [System.IO.Path]::Combine($arm_nupkg_unzipped_directory, 'runtimes', 'win-arm', '_native') - $arm_runtime_path_new = [System.IO.Path]::Combine($x64_nupkg_unzipped_directory, 'runtimes', 'win-arm', '_native') - $arm_static_runtime_path_old = 
[System.IO.Path]::Combine($arm_static_runtime_nupkg_unzipped_directory, 'runtimes', 'win-arm', '_native') - $arm_static_runtime_path_new = [System.IO.Path]::Combine($x64_nupkg_unzipped_directory, 'runtimes', 'win-arm', '_native', 'static') + $uap_build_path_old = [System.IO.Path]::Combine($x64_static_runtime_nupkg_unzipped_directory, 'build', 'native') $uap_build_path_new = [System.IO.Path]::Combine($x64_nupkg_unzipped_directory, 'build', 'uap10.0') @@ -228,8 +194,6 @@ extends: New-Item -Path $x86_static_runtime_path_new -ItemType Directory New-Item -Path $arm64_runtime_path_new -ItemType Directory New-Item -Path $arm64_static_runtime_path_new -ItemType Directory - New-Item -Path $arm_runtime_path_new -ItemType Directory - New-Item -Path $arm_static_runtime_path_new -ItemType Directory Copy-Item ([System.IO.Path]::Combine($x86_runtime_path_old, 'onnxruntime.dll')) $x86_runtime_path_new Copy-Item ([System.IO.Path]::Combine($x86_runtime_path_old, 'onnxruntime.lib')) $x86_runtime_path_new @@ -241,11 +205,6 @@ extends: Copy-Item ([System.IO.Path]::Combine($arm64_runtime_path_old, 'microsoft.ai.machinelearning.dll')) $arm64_runtime_path_new Copy-Item ([System.IO.Path]::Combine($arm64_runtime_path_old, 'microsoft.ai.machinelearning.lib')) $arm64_runtime_path_new - Copy-Item ([System.IO.Path]::Combine($arm_runtime_path_old, 'onnxruntime.dll')) $arm_runtime_path_new - Copy-Item ([System.IO.Path]::Combine($arm_runtime_path_old, 'onnxruntime.lib')) $arm_runtime_path_new - Copy-Item ([System.IO.Path]::Combine($arm_runtime_path_old, 'microsoft.ai.machinelearning.dll')) $arm_runtime_path_new - Copy-Item ([System.IO.Path]::Combine($arm_runtime_path_old, 'microsoft.ai.machinelearning.lib')) $arm_runtime_path_new - Copy-Item ([System.IO.Path]::Combine($x64_static_runtime_path_old, 'onnxruntime.dll')) ([System.IO.Path]::Combine($x64_static_runtime_path_new, 'onnxruntime.dll')) Copy-Item ([System.IO.Path]::Combine($x64_static_runtime_path_old, 'onnxruntime.lib')) ([System.IO.Path]::Combine($x64_static_runtime_path_new, 'onnxruntime.lib')) Copy-Item ([System.IO.Path]::Combine($x64_static_runtime_path_old, 'microsoft.ai.machinelearning.dll')) ([System.IO.Path]::Combine($x64_static_runtime_path_new, 'microsoft.ai.machinelearning.dll')) @@ -261,11 +220,6 @@ extends: Copy-Item ([System.IO.Path]::Combine($arm64_static_runtime_path_old, 'microsoft.ai.machinelearning.dll')) ([System.IO.Path]::Combine($arm64_static_runtime_path_new, 'microsoft.ai.machinelearning.dll')) Copy-Item ([System.IO.Path]::Combine($arm64_static_runtime_path_old, 'microsoft.ai.machinelearning.lib')) ([System.IO.Path]::Combine($arm64_static_runtime_path_new, 'microsoft.ai.machinelearning.lib')) - Copy-Item ([System.IO.Path]::Combine($arm_static_runtime_path_old, 'onnxruntime.dll')) ([System.IO.Path]::Combine($arm_static_runtime_path_new, 'onnxruntime.dll')) - Copy-Item ([System.IO.Path]::Combine($arm_static_runtime_path_old, 'onnxruntime.lib')) ([System.IO.Path]::Combine($arm_static_runtime_path_new, 'onnxruntime.lib')) - Copy-Item ([System.IO.Path]::Combine($arm_static_runtime_path_old, 'microsoft.ai.machinelearning.dll')) ([System.IO.Path]::Combine($arm_static_runtime_path_new, 'microsoft.ai.machinelearning.dll')) - Copy-Item ([System.IO.Path]::Combine($arm_static_runtime_path_old, 'microsoft.ai.machinelearning.lib')) ([System.IO.Path]::Combine($arm_static_runtime_path_new, 'microsoft.ai.machinelearning.lib')) - Copy-Item -Recurse $uap_build_path_old $uap_build_path_new $merged_nuget_path = 
[System.IO.Path]::Combine($Env:BUILD_ARTIFACTSTAGINGDIRECTORY, 'merged') @@ -304,22 +258,13 @@ extends: $arm64_nupkg_unzipped_directory = [System.IO.Path]::Combine($arm64_nupkg_unzipped_directory_root, 'symbols', [System.IO.Path]::GetFileNameWithoutExtension($arm64_nuget_package)) [System.IO.Compression.ZipFile]::ExtractToDirectory($arm64_nuget_package, $arm64_nupkg_unzipped_directory) - $nupkgs = (Get-ChildItem ..\nuget-artifact-arm -Filter Microsoft.AI.MachineLearning*.snupkg -Recurse) - $arm_nuget_package = $nupkgs[0].FullName - $arm_nupkg_unzipped_directory_root = $nupkgs[0].Directory.FullName - $arm_nupkg_unzipped_directory = [System.IO.Path]::Combine($arm_nupkg_unzipped_directory_root, 'symbols', [System.IO.Path]::GetFileNameWithoutExtension($arm_nuget_package)) - [System.IO.Compression.ZipFile]::ExtractToDirectory($arm_nuget_package, $arm_nupkg_unzipped_directory) - $x86_runtime_path_old = [System.IO.Path]::Combine($x86_nupkg_unzipped_directory, 'runtimes', 'win-x86', '_native') $x86_runtime_path_new = [System.IO.Path]::Combine($x64_nupkg_unzipped_directory, 'runtimes', 'win-x86', '_native') $arm64_runtime_path_old = [System.IO.Path]::Combine($arm64_nupkg_unzipped_directory, 'runtimes', 'win-arm64', '_native') $arm64_runtime_path_new = [System.IO.Path]::Combine($x64_nupkg_unzipped_directory, 'runtimes', 'win-arm64', '_native') - $arm_runtime_path_old = [System.IO.Path]::Combine($arm_nupkg_unzipped_directory, 'runtimes', 'win-arm', '_native') - $arm_runtime_path_new = [System.IO.Path]::Combine($x64_nupkg_unzipped_directory, 'runtimes', 'win-arm', '_native') - + New-Item -Path $x86_runtime_path_new -ItemType Directory New-Item -Path $arm64_runtime_path_new -ItemType Directory - New-Item -Path $arm_runtime_path_new -ItemType Directory Copy-Item ([System.IO.Path]::Combine($x86_runtime_path_old, 'onnxruntime.pdb')) $x86_runtime_path_new Copy-Item ([System.IO.Path]::Combine($x86_runtime_path_old, 'microsoft.ai.machinelearning.pdb')) $x86_runtime_path_new @@ -327,9 +272,6 @@ extends: Copy-Item ([System.IO.Path]::Combine($arm64_runtime_path_old, 'onnxruntime.pdb')) $arm64_runtime_path_new Copy-Item ([System.IO.Path]::Combine($arm64_runtime_path_old, 'microsoft.ai.machinelearning.pdb')) $arm64_runtime_path_new - Copy-Item ([System.IO.Path]::Combine($arm_runtime_path_old, 'onnxruntime.pdb')) $arm_runtime_path_new - Copy-Item ([System.IO.Path]::Combine($arm_runtime_path_old, 'microsoft.ai.machinelearning.pdb')) $arm_runtime_path_new - $merged_nuget_path = [System.IO.Path]::Combine($Env:BUILD_ARTIFACTSTAGINGDIRECTORY, 'merged') if (!(Test-Path $merged_nuget_path)) { New-Item -Path $merged_nuget_path -ItemType Directory diff --git a/.pipelines/nuget_config/x64/packages.config b/.pipelines/nuget_config/x64/packages.config index 2ac650b0e6dc..b862dec5e1c8 100644 --- a/.pipelines/nuget_config/x64/packages.config +++ b/.pipelines/nuget_config/x64/packages.config @@ -1,6 +1,6 @@  - + diff --git a/.pipelines/nuget_config/x86/packages.config b/.pipelines/nuget_config/x86/packages.config index f80f96194a23..c348dd3e9cda 100644 --- a/.pipelines/nuget_config/x86/packages.config +++ b/.pipelines/nuget_config/x86/packages.config @@ -1,6 +1,6 @@  - + diff --git a/.pipelines/windowsai-steps.yml b/.pipelines/windowsai-steps.yml index 292ce60c6b6c..855573de753b 100644 --- a/.pipelines/windowsai-steps.yml +++ b/.pipelines/windowsai-steps.yml @@ -80,11 +80,11 @@ jobs: # must call vsdevcmd first to add cmake to PATH - script: | - curl -O -L 
https://github.com/Kitware/CMake/releases/download/v3.26.3/cmake-3.26.3-windows-x86_64.zip - 7z x cmake-3.26.3-windows-x86_64.zip + curl -O -L https://github.com/Kitware/CMake/releases/download/v3.28.3/cmake-3.28.3-windows-x86_64.zip + 7z x cmake-3.28.3-windows-x86_64.zip set PYTHONHOME=$(Build.BinariesDirectory)\${{ parameters.PythonPackageName }}.3.9.7\tools set PYTHONPATH=$(Build.BinariesDirectory)\${{ parameters.PythonPackageName }}.3.9.7\tools - $(Build.BinariesDirectory)\${{ parameters.PythonPackageName }}.3.9.7\tools\python.exe "$(Build.SourcesDirectory)\tools\ci_build\build.py" --build_dir $(Build.BinariesDirectory) --build_shared_lib --enable_onnx_tests --ms_experimental --use_dml --use_winml --cmake_generator "Visual Studio 17 2022" --update --config RelWithDebInfo --enable_lto --use_telemetry --disable_rtti --enable_wcos $(BuildFlags) --cmake_extra_defines "CMAKE_EXE_LINKER_FLAGS_RELWITHDEBINFO=/PROFILE" "CMAKE_SHARED_LINKER_FLAGS_RELWITHDEBINFO=/PROFILE" CMAKE_SYSTEM_VERSION=10.0.19041.0 --cmake_path $(Build.BinariesDirectory)\cmake-3.26.3-windows-x86_64\bin\cmake.exe --ctest_path $(Build.BinariesDirectory)\cmake-3.26.3-windows-x86_64\bin\ctest.exe + $(Build.BinariesDirectory)\${{ parameters.PythonPackageName }}.3.9.7\tools\python.exe "$(Build.SourcesDirectory)\tools\ci_build\build.py" --build_dir $(Build.BinariesDirectory) --parallel --use_binskim_compliant_compile_flags --build_shared_lib --enable_onnx_tests --ms_experimental --use_dml --use_winml --cmake_generator "Visual Studio 17 2022" --update --config RelWithDebInfo --enable_lto --use_telemetry --disable_rtti --enable_wcos --windows_sdk_version "10.0.22621.0" $(BuildFlags) --cmake_extra_defines "CMAKE_EXE_LINKER_FLAGS_RELWITHDEBINFO=/PROFILE" "CMAKE_SHARED_LINKER_FLAGS_RELWITHDEBINFO=/PROFILE" --cmake_path $(Build.BinariesDirectory)\cmake-3.28.3-windows-x86_64\bin\cmake.exe --ctest_path $(Build.BinariesDirectory)\cmake-3.28.3-windows-x86_64\bin\ctest.exe workingDirectory: '$(Build.BinariesDirectory)' displayName: 'Generate cmake config' diff --git a/.vscode/settings.json b/.vscode/settings.json index 2f2adc78f6de..98d23090fd47 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -11,7 +11,7 @@ // Auto sort imports "editor.formatOnSave": true, "editor.codeActionsOnSave": { - "source.organizeImports": true + "source.organizeImports": "explicit" }, "editor.defaultFormatter": "ms-python.black-formatter" }, @@ -21,5 +21,8 @@ "cpplint.filters": [ "-build/include_subdir", "-runtime/references" - ] + ], + "files.associations": { + "span": "cpp" + } } diff --git a/CITATION.cff b/CITATION.cff index 82bcac5a7b75..10b7290022ae 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -3,8 +3,7 @@ title: ONNX Runtime message: "Please use this information to cite ONNX Runtime in research or other publications." 
authors: - - affiliation: Microsoft Corporation - given-names: ONNX Runtime developers + - name: ONNX Runtime developers date-released: 2018-11-29 url: "https://onnxruntime.ai" repository-code: "https://github.com/microsoft/onnxruntime" diff --git a/README.md b/README.md index 33bce867e3bd..24c3e191c115 100644 --- a/README.md +++ b/README.md @@ -10,7 +10,7 @@ * **General Information**: [onnxruntime.ai](https://onnxruntime.ai) -* **Usage documention and tutorials**: [onnxruntime.ai/docs](https://onnxruntime.ai/docs) +* **Usage documentation and tutorials**: [onnxruntime.ai/docs](https://onnxruntime.ai/docs) * **YouTube video tutorials**: [youtube.com/@ONNXRuntime](https://www.youtube.com/@ONNXRuntime) diff --git a/ThirdPartyNotices.txt b/ThirdPartyNotices.txt index 700206180dec..8ec770da2215 100644 --- a/ThirdPartyNotices.txt +++ b/ThirdPartyNotices.txt @@ -1829,7 +1829,7 @@ Zbigniew Skowron _____ -HalidelR +HalideIR Copyright (c) 2016 HalideIR contributors Copyright (c) 2012-2014 MIT CSAIL, Google Inc., and other contributors @@ -6299,3 +6299,210 @@ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +_____ + +neural-speed + +https://github.com/intel/neural-speed + + Apache License + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. 
+ + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + ============================================================================ + + Copyright 2016-2019 Intel Corporation + Copyright 2018 YANDEX LLC + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + + This distribution includes third party software ("third party programs"). + This third party software, even if included with the distribution of + the Intel software, may be governed by separate license terms, including + without limitation, third party license terms, other Intel software license + terms, and open source software license terms. These separate license terms + govern your use of the third party programs as set forth in the + "THIRD-PARTY-PROGRAMS" file. 
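The neural-speed license above accompanies a build-system change made later in this diff: cmake/CMakeLists.txt replaces the old `onnxruntime_USE_JBLAS` option with `onnxruntime_USE_NEURAL_SPEED`, pulls the dependency in through a new `neural_speed` CMake module, and exposes it to sources via an `ORT_NEURAL_SPEED` compile definition. Those pieces are spread across three separate hunks below; the following is a condensed sketch of how they fit together (assembled from the hunks, not a verbatim excerpt of the file — `neural_speed.cmake` is expected to set `USE_NEURAL_SPEED` and define the `neural_speed::bestla` target):

    # User-facing switch, ON by default (replaces onnxruntime_USE_JBLAS).
    option(onnxruntime_USE_NEURAL_SPEED "Build with Neural Speed support" ON)

    # Dependency wiring: skipped for minimal and TVM builds.
    if (onnxruntime_USE_NEURAL_SPEED AND NOT onnxruntime_MINIMAL_BUILD AND NOT onnxruntime_USE_TVM)
      include(neural_speed)
      if (USE_NEURAL_SPEED)
        list(APPEND onnxruntime_EXTERNAL_LIBRARIES neural_speed::bestla)
      endif()
    endif()

    # Inside onnxruntime_set_compile_flags(target_name): expose the feature to the code.
    if (USE_NEURAL_SPEED)
      target_compile_definitions(${target_name} PRIVATE ORT_NEURAL_SPEED)
    endif()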
diff --git a/VERSION_NUMBER b/VERSION_NUMBER index 092afa15df4d..84cc529467b0 100644 --- a/VERSION_NUMBER +++ b/VERSION_NUMBER @@ -1 +1 @@ -1.17.0 +1.18.0 diff --git a/build_arm64x.bat b/build_arm64x.bat index fbcdd373086a..1ed268ae94a4 100644 --- a/build_arm64x.bat +++ b/build_arm64x.bat @@ -5,7 +5,6 @@ setlocal set PATH=C:\Program Files\Git\usr\bin;%PATH% -set LINK_REPRO_NAME=/mylink.rsp rem Requires a Python install to be available in your PATH python "%~dp0\tools\ci_build\build.py" --arm64 --buildasx --build_dir "%~dp0\build\arm64-x" %* diff --git a/cgmanifests/cgmanifest.json b/cgmanifests/cgmanifest.json index e8dbc9cf9eff..cf245e63a3a5 100644 --- a/cgmanifests/cgmanifest.json +++ b/cgmanifests/cgmanifest.json @@ -469,7 +469,7 @@ "type": "pip", "pip": { "Name": "transformers", - "Version": "2.11.0" + "Version": "4.36.0" }, "comments": "Installed in the training docker image" } @@ -570,7 +570,7 @@ "git": { "commitHash": "e7248b26a1ed53fa030c5c459f7ea095dfd276ac", "repositoryUrl": "https://gitlab.com/libeigen/eigen.git" - } + } } } ], diff --git a/cgmanifests/generate_cgmanifest.py b/cgmanifests/generate_cgmanifest.py index 81181d3ccfb2..3cecbb0cc977 100644 --- a/cgmanifests/generate_cgmanifest.py +++ b/cgmanifests/generate_cgmanifest.py @@ -115,8 +115,8 @@ def normalize_path_separators(path): submodule_lines = proc.stdout.splitlines() for submodule_line in submodule_lines: (absolute_path, url, commit) = submodule_line.split(" ") - git_deps[GitDep(commit, url)] = "git submodule at {}".format( - normalize_path_separators(os.path.relpath(absolute_path, REPO_DIR)) + git_deps[GitDep(commit, url)] = ( + f"git submodule at {normalize_path_separators(os.path.relpath(absolute_path, REPO_DIR))}" ) with open(os.path.join(SCRIPT_DIR, "..", "cmake", "deps.txt")) as f: diff --git a/cgmanifests/generated/cgmanifest.json b/cgmanifests/generated/cgmanifest.json index 137ea8a50c01..b26455379b96 100644 --- a/cgmanifests/generated/cgmanifest.json +++ b/cgmanifests/generated/cgmanifest.json @@ -6,7 +6,7 @@ "component": { "type": "git", "git": { - "commitHash": "a896e3d066448b3530dbcaa48869fafefd738f57", + "commitHash": "4e2496141eda15040c44e9bbf237a1326368e34c", "repositoryUrl": "https://github.com/emscripten-core/emsdk.git" }, "comments": "git submodule at cmake/external/emsdk" @@ -26,7 +26,7 @@ "component": { "type": "git", "git": { - "commitHash": "b86cc54efce19530fb953e4b21f57e6b3888534c", + "commitHash": "990217f043af7222348ca8f0301e17fa7b841781", "repositoryUrl": "https://github.com/onnx/onnx.git" }, "comments": "git submodule at cmake/external/onnx" @@ -36,12 +36,22 @@ "component": { "type": "git", "git": { - "commitHash": "dcd5bd5fd593e31465af3d9ef291d26c646b0a4f", + "commitHash": "4a2c63365eff8823a5221db86ef490e828306f9d", "repositoryUrl": "https://github.com/abseil/abseil-cpp.git" }, "comments": "abseil_cpp" } }, + { + "component": { + "type": "git", + "git": { + "commitHash": "dbb0094fd0cb936469e35320bf37e866ef7a1da4", + "repositoryUrl": "https://github.com/apple/coremltools.git" + }, + "comments": "coremltools" + } + }, { "component": { "type": "git", @@ -76,7 +86,7 @@ "component": { "type": "git", "git": { - "commitHash": "6df40a2471737b27271bdd9b900ab5f3aec746c7", + "commitHash": "0100f6a5779831fa7a651e4b67ef389a8752bd9b", "repositoryUrl": "https://github.com/google/flatbuffers.git" }, "comments": "flatbuffers" @@ -106,7 +116,7 @@ "component": { "type": "git", "git": { - "commitHash": "361e8d1cfe0c6c36d30b39f1b61302ece5507320", + "commitHash": "344117638c8ff7e239044fd0fa7085839fc03021", 
"repositoryUrl": "https://github.com/google/benchmark.git" }, "comments": "google_benchmark" @@ -196,7 +206,17 @@ "component": { "type": "git", "git": { - "commitHash": "a43ce67187bab219520fd80f21af8bbd4354bc8c", + "commitHash": "150e7527d5286ddd3a995c228dedf8d76a7a86bc", + "repositoryUrl": "https://github.com/intel/neural-speed.git" + }, + "comments": "neural_speed" + } + }, + { + "component": { + "type": "git", + "git": { + "commitHash": "bacfaaa951653cd4e72efe727a543567cb38f7de", "repositoryUrl": "https://github.com/onnx/onnx-tensorrt.git" }, "comments": "onnx_tensorrt" @@ -321,6 +341,16 @@ }, "comments": "composable_kernel" } + }, + { + "component": { + "type": "git", + "git": { + "commitHash": "de28d93dfa9ebf3e473127c1c657e1920a5345ee", + "repositoryUrl": "https://github.com/microsoft/DirectX-Headers.git" + }, + "comments": "directx_headers" + } } ] } diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt index 73c974f20c25..87355c94223a 100644 --- a/cmake/CMakeLists.txt +++ b/cmake/CMakeLists.txt @@ -76,9 +76,10 @@ option(onnxruntime_USE_CUDA "Build with CUDA support" OFF) # Enable ONNX Runtime CUDA EP's internal unit tests that directly access the EP's internal functions instead of through # OpKernels. When the option is ON, we will have two copies of GTest library in the same process. It is not a typical # use. If you hit any problem with that, please do not report it to GTest. Turn OFF the following build option instead. -cmake_dependent_option(onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS "Build with CUDA unit tests" OFF "onnxruntime_USE_CUDA;onnxruntime_BUILD_UNIT_TESTS;LINUX" OFF) +cmake_dependent_option(onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS "Build with CUDA unit tests" OFF "onnxruntime_USE_CUDA;onnxruntime_BUILD_UNIT_TESTS" OFF) option(onnxruntime_USE_CUDA_NHWC_OPS "Build CUDA with NHWC op support" OFF) +option(onnxruntime_CUDA_MINIMAL "Build CUDA without any operations apart from memcpy ops. Usefuel for a very minial TRT build" OFF) option(onnxruntime_ENABLE_CUDA_LINE_NUMBER_INFO "When building with CUDA support, generate device code line number information." OFF) option(onnxruntime_USE_OPENVINO "Build with OpenVINO support" OFF) option(onnxruntime_USE_COREML "Build with CoreML support" OFF) @@ -87,7 +88,7 @@ option(onnxruntime_USE_QNN "Build with QNN support" OFF) option(onnxruntime_USE_SNPE "Build with SNPE support" OFF) option(onnxruntime_USE_RKNPU "Build with RKNPU support" OFF) option(onnxruntime_USE_DNNL "Build with DNNL support" OFF) -option(onnxruntime_USE_JBLAS "Build MLAS with JBLAS support" ON) +option(onnxruntime_USE_NEURAL_SPEED "Build with Neural Speed support" ON) option(onnxruntime_USE_JSEP "Build with JavaScript implemented kernels support" OFF) option(onnxruntime_BUILD_UNIT_TESTS "Build ONNXRuntime unit tests" ON) option(onnxruntime_BUILD_CSHARP "Build C# library" OFF) @@ -96,7 +97,6 @@ option(onnxruntime_USE_PREINSTALLED_EIGEN "Use pre-installed EIGEN. 
Need to provide eigen_SOURCE_PATH if turn this on." OFF) option(onnxruntime_BUILD_BENCHMARKS "Build ONNXRuntime micro-benchmarks" OFF) option(onnxruntime_USE_LLVM "Build TVM with LLVM" OFF) -cmake_dependent_option(onnxruntime_USE_CUTLASS "Build with cutlass support" ON "onnxruntime_USE_CUDA" OFF) cmake_dependent_option(onnxruntime_USE_FLASH_ATTENTION "Build flash attention kernel for scaled dot product attention" ON "NOT WIN32; onnxruntime_USE_CUDA" OFF) option(onnxruntime_USE_MEMORY_EFFICIENT_ATTENTION "Build memory efficient attention kernel for scaled dot product attention" ON) @@ -117,9 +117,7 @@ option(onnxruntime_CROSS_COMPILING "Cross compiling onnx runtime" OFF) option(onnxruntime_GCOV_COVERAGE "Compile with options necessary to run code coverage" OFF) option(onnxruntime_DONT_VECTORIZE "Do not vectorize operations in Eigen" OFF) -#It's preferred to turn it OFF when onnxruntime is dynamically linked to PROTOBUF. But Tensort always required the full version of protobuf. -cmake_dependent_option(onnxruntime_USE_FULL_PROTOBUF "Link to libprotobuf instead of libprotobuf-lite when this option is ON" OFF "NOT onnxruntime_USE_TENSORRT" ON) -option(tensorflow_C_PACKAGE_PATH "Path to tensorflow C package installation dir") +option(onnxruntime_USE_FULL_PROTOBUF "Link to libprotobuf instead of libprotobuf-lite when this option is ON" OFF) option(onnxruntime_ENABLE_LANGUAGE_INTEROP_OPS "Enable operator implemented in language other than cpp" OFF) option(onnxruntime_DEBUG_NODE_INPUTS_OUTPUTS "Dump debug information about node inputs and outputs when executing the model." OFF) cmake_dependent_option(onnxruntime_DEBUG_NODE_INPUTS_OUTPUTS_ENABLE_DUMP_TO_SQLDB "Build dump debug information about node inputs and outputs with support for sql database." OFF "onnxruntime_DEBUG_NODE_INPUTS_OUTPUTS" OFF) @@ -131,6 +129,7 @@ option(onnxruntime_USE_ACL_1902 "Build with ACL version 1902 support" OFF) option(onnxruntime_USE_ACL_1905 "Build with ACL version 1905 support" OFF) option(onnxruntime_USE_ACL_1908 "Build with ACL version 1908 support" OFF) option(onnxruntime_USE_ACL_2002 "Build with ACL version 2002 support" OFF) +option(onnxruntime_USE_ACL_2308 "Build with ACL version 2308 support" OFF) option(onnxruntime_USE_ARMNN "Build with ArmNN support" OFF) option(onnxruntime_ARMNN_RELU_USE_CPU "Use the CPU implementation for the Relu operator for the ArmNN EP" ON) option(onnxruntime_ARMNN_BN_USE_CPU "Use the CPU implementation for the Batch Normalization operator for the ArmNN EP" ON) @@ -324,17 +323,29 @@ if (onnxruntime_USE_ROCM) endif() # replicate strategy used by pytorch to get ROCM_VERSION - # https://github.com/pytorch/pytorch/blob/8eb21488fdcdb8b0e6fa2e46179b5fa6c42e75af/cmake/public/LoadHIP.cmake#L153-L173 - file(READ "${onnxruntime_ROCM_HOME}/.info/version-dev" ROCM_VERSION_DEV_RAW) - string(REGEX MATCH "^([0-9]+)\.([0-9]+)\.([0-9]+)-.*$" ROCM_VERSION_DEV_MATCH ${ROCM_VERSION_DEV_RAW}) - if (ROCM_VERSION_DEV_MATCH) + # https://github.com/pytorch/pytorch/blob/5c5b71b6eebae76d744261715231093e62f0d090/cmake/public/LoadHIP.cmake + # with modification + if (EXISTS "${onnxruntime_ROCM_HOME}/.info/version") + file(READ "${onnxruntime_ROCM_HOME}/.info/version" ROCM_VERSION_DEV_RAW) + string(REGEX MATCH "^([0-9]+)\.([0-9]+)\.([0-9]+)-.*$" ROCM_VERSION_MATCH ${ROCM_VERSION_DEV_RAW}) + elseif (EXISTS "${onnxruntime_ROCM_HOME}/include/rocm_version.h") + file(READ "${onnxruntime_ROCM_HOME}/include/rocm_version.h" ROCM_VERSION_H_RAW) + string(REGEX MATCH "\"([0-9]+)\.([0-9]+)\.([0-9]+).*\"" ROCM_VERSION_MATCH ${ROCM_VERSION_H_RAW}) + elseif (EXISTS 
"${onnxruntime_ROCM_HOME}/include/rocm-core/rocm_version.h") + file(READ "${onnxruntime_ROCM_HOME}/include/rocm-core/rocm_version.h" ROCM_VERSION_H_RAW) + string(REGEX MATCH "\"([0-9]+)\.([0-9]+)\.([0-9]+).*\"" ROCM_VERSION_MATCH ${ROCM_VERSION_H_RAW}) + endif() + + if (ROCM_VERSION_MATCH) set(ROCM_VERSION_DEV_MAJOR ${CMAKE_MATCH_1}) set(ROCM_VERSION_DEV_MINOR ${CMAKE_MATCH_2}) set(ROCM_VERSION_DEV_PATCH ${CMAKE_MATCH_3}) set(ROCM_VERSION_DEV "${ROCM_VERSION_DEV_MAJOR}.${ROCM_VERSION_DEV_MINOR}.${ROCM_VERSION_DEV_PATCH}") math(EXPR ROCM_VERSION_DEV_INT "(${ROCM_VERSION_DEV_MAJOR}*10000) + (${ROCM_VERSION_DEV_MINOR}*100) + ${ROCM_VERSION_DEV_PATCH}") + else() + message(FATAL_ERROR "Cannot determine ROCm version string") endif() - message("\n***** ROCm version from ${onnxruntime_ROCM_HOME}/.info/version-dev ****\n") + message("\n***** ROCm version from ${onnxruntime_ROCM_HOME}/.info/version ****\n") message("ROCM_VERSION_DEV: ${ROCM_VERSION_DEV}") message("ROCM_VERSION_DEV_MAJOR: ${ROCM_VERSION_DEV_MAJOR}") message("ROCM_VERSION_DEV_MINOR: ${ROCM_VERSION_DEV_MINOR}") @@ -354,13 +365,7 @@ if (onnxruntime_USE_ROCM) endif() endif() -if (APPLE) - if (NOT CMAKE_OSX_ARCHITECTURES) - message("Building ONNX Runtime for ${CMAKE_HOST_SYSTEM_PROCESSOR}") - endif() -elseif (NOT WIN32 AND NOT APPLE) - message("Building ONNX Runtime for ${CMAKE_SYSTEM_PROCESSOR}") -endif() + # Single output director for all binaries set(RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin CACHE PATH "Single output directory for all binaries.") @@ -493,6 +498,14 @@ endif() include(adjust_global_compile_flags.cmake) +if (APPLE) + if (NOT CMAKE_OSX_ARCHITECTURES) + message("Building ONNX Runtime for ${CMAKE_HOST_SYSTEM_PROCESSOR} CPU ARCH") + endif() +elseif (NOT WIN32 AND NOT APPLE) + message("Building ONNX Runtime for ${onnxruntime_target_platform} CPU ARCH") +endif() + # We need to link with libatomic on systems that do not have built-in atomics, or # don't have built-in support for 8 byte atomics # Derived from https://github.com/protocolbuffers/protobuf/blob/master/cmake/CMakeLists.txt @@ -638,8 +651,18 @@ else() check_cxx_compiler_flag(-Wunused-but-set-variable HAS_UNUSED_BUT_SET_VARIABLE) check_cxx_compiler_flag(-Wunused-variable HAS_UNUSED_VARIABLE) check_cxx_compiler_flag(-Wuseless-cast HAS_USELESS_CAST) + check_cxx_compiler_flag(-Wstringop-overflow HAS_STRINGOP_OVERFLOW) check_function_exists(reallocarray HAS_REALLOCARRAY) - + if (NOT APPLE AND NOT CMAKE_SYSTEM_NAME STREQUAL "Emscripten" AND onnxruntime_target_platform STREQUAL "aarch64") + check_cxx_compiler_flag(-march=armv8.2-a+bf16 HAS_ARM64_BFLOAT16) + if(NOT HAS_ARM64_BFLOAT16) + message(FATAL_ERROR "The compiler doesn't support BFLOAT16!!!") + endif() + check_cxx_compiler_flag(-march=armv8.2-a+fp16 HAS_ARM64_FLOAT16) + if(NOT HAS_ARM64_FLOAT16) + message(FATAL_ERROR "The compiler doesn't support FLOAT16!!!") + endif() + endif() if (HAS_TAUTOLOGICAL_POINTER_COMPARE) #we may have extra null pointer checkings in debug build, it's not an issue list(APPEND ORT_WARNING_FLAGS -Wno-tautological-pointer-compare) @@ -694,20 +717,19 @@ if (onnxruntime_USE_CUDA) enable_language(CUDA) message( STATUS "CMAKE_CUDA_COMPILER_VERSION: ${CMAKE_CUDA_COMPILER_VERSION}") + if (onnxruntime_DISABLE_CONTRIB_OPS) + set(onnxruntime_USE_FLASH_ATTENTION OFF) + set(onnxruntime_USE_MEMORY_EFFICIENT_ATTENTION OFF) + endif() if (CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 11.6) - message( STATUS "Turn off cutlass since CUDA compiler version < 11.6") - set(onnxruntime_USE_CUTLASS OFF) + message( 
STATUS "Turn off flash attention since CUDA compiler version < 11.6") + set(onnxruntime_USE_FLASH_ATTENTION OFF) + set(onnxruntime_USE_MEMORY_EFFICIENT_ATTENTION OFF) + endif() + if (CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 11.4) + message( FATAL_ERROR "Failed build due to CUDA compiler version < 11.4") endif() else() - set(onnxruntime_USE_CUTLASS OFF) -endif() - -if (NOT onnxruntime_USE_CUTLASS OR onnxruntime_DISABLE_CONTRIB_OPS) - if (onnxruntime_DISABLE_CONTRIB_OPS) - message( STATUS "Turn off flash attention/memory efficient attention since contrib ops are disabled") - else() - message( STATUS "Turn off flash attention/memory efficient attention since cutlass is not enabled") - endif() set(onnxruntime_USE_FLASH_ATTENTION OFF) set(onnxruntime_USE_MEMORY_EFFICIENT_ATTENTION OFF) endif() @@ -727,8 +749,8 @@ if (onnxruntime_USE_CUDA) list(APPEND ORT_PROVIDER_FLAGS -DUSE_MEMORY_EFFICIENT_ATTENTION=1) list(APPEND ORT_PROVIDER_CMAKE_FLAGS -Donnxruntime_USE_MEMORY_EFFICIENT_ATTENTION=1) endif() - endif() + if (onnxruntime_USE_VITISAI) list(APPEND ORT_PROVIDER_FLAGS -DUSE_VITISAI=1) list(APPEND ORT_PROVIDER_CMAKE_FLAGS -Donnxruntime_USE_VITISAI=1) @@ -769,6 +791,38 @@ if (onnxruntime_USE_QNN) list(APPEND ORT_PROVIDER_FLAGS -DUSE_QNN=1) list(APPEND ORT_PROVIDER_CMAKE_FLAGS -Donnxruntime_USE_QNN=1) list(APPEND ONNXRUNTIME_PROVIDER_NAMES qnn) + if (NOT QNN_ARCH_ABI) + string(TOLOWER ${onnxruntime_target_platform} GEN_PLATFORM) + if(MSVC) + message(STATUS "Building MSVC for architecture ${CMAKE_SYSTEM_PROCESSOR} with CMAKE_GENERATOR_PLATFORM as ${GEN_PLATFORM}") + if (${GEN_PLATFORM} STREQUAL "arm64") + set(QNN_ARCH_ABI aarch64-windows-msvc) + else() + set(QNN_ARCH_ABI x86_64-windows-msvc) + endif() + else() + if (${CMAKE_SYSTEM_NAME} STREQUAL "Android") + set(QNN_ARCH_ABI aarch64-android-clang6.0) + elseif (${CMAKE_SYSTEM_NAME} STREQUAL "Linux") + if (${GEN_PLATFORM} STREQUAL "x86_64") + set(QNN_ARCH_ABI x86_64-linux-clang) + else() + set(QNN_ARCH_ABI aarch64-android) + endif() + endif() + endif() + endif() + + if (MSVC OR ${CMAKE_SYSTEM_NAME} STREQUAL "Linux") + file(GLOB QNN_LIB_FILES LIST_DIRECTORIES false "${onnxruntime_QNN_HOME}/lib/${QNN_ARCH_ABI}/libQnn*.so" "${onnxruntime_QNN_HOME}/lib/${QNN_ARCH_ABI}/Qnn*.dll") + if (${QNN_ARCH_ABI} STREQUAL "aarch64-windows-msvc") + file(GLOB EXTRA_HTP_LIB LIST_DIRECTORIES false "${onnxruntime_QNN_HOME}/lib/hexagon-v68/unsigned/libQnnHtpV68Skel.so" + "${onnxruntime_QNN_HOME}/lib/hexagon-v73/unsigned/libQnnHtpV73Skel.so" + "${onnxruntime_QNN_HOME}/lib/hexagon-v73/unsigned/libqnnhtpv73.cat") + list(APPEND QNN_LIB_FILES ${EXTRA_HTP_LIB}) + endif() + message(STATUS "QNN lib files: " ${QNN_LIB_FILES}) + endif() endif() if (onnxruntime_USE_SNPE) list(APPEND ORT_PROVIDER_FLAGS -DUSE_SNPE=1) @@ -893,8 +947,8 @@ function(onnxruntime_set_compile_flags target_name) target_compile_definitions(${target_name} PRIVATE ENABLE_ATEN) endif() - if (onnxruntime_USE_CUTLASS) - target_compile_definitions(${target_name} PRIVATE USE_CUTLASS) + if(USE_NEURAL_SPEED) + target_compile_definitions(${target_name} PRIVATE ORT_NEURAL_SPEED) endif() set_target_properties(${target_name} PROPERTIES COMPILE_WARNING_AS_ERROR ON) @@ -976,9 +1030,12 @@ function(onnxruntime_set_compile_flags target_name) foreach(FLAG ${ORT_WARNING_FLAGS}) target_compile_options(${target_name} PRIVATE "$<$:SHELL:--compiler-options ${FLAG}>") endforeach() - if ((NVCC_HAS_STRICT_ALIASING AND "${target_name}" MATCHES "cuda") OR (HAS_STRICT_ALIASING AND NOT "${target_name}" MATCHES "cuda")) + if 
(NVCC_HAS_STRICT_ALIASING AND "${target_name}" MATCHES "cuda") target_compile_options(${target_name} PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:-Wno-strict-aliasing>") endif() + if (HAS_STRICT_ALIASING AND NOT "${target_name}" MATCHES "cuda") + target_compile_options(${target_name} PRIVATE "$<$<COMPILE_LANGUAGE:CXX>:-Wno-strict-aliasing>") + endif() endif() if (onnxruntime_USE_ROCM) # flags are detected with CXX language mode, some flags are not supported with hipclang @@ -1099,7 +1156,7 @@ function(onnxruntime_add_include_to_target dst_target) endfunction() # ACL -if (onnxruntime_USE_ACL OR onnxruntime_USE_ACL_1902 OR onnxruntime_USE_ACL_1905 OR onnxruntime_USE_ACL_1908 OR onnxruntime_USE_ACL_2002) +if (onnxruntime_USE_ACL OR onnxruntime_USE_ACL_1902 OR onnxruntime_USE_ACL_1905 OR onnxruntime_USE_ACL_1908 OR onnxruntime_USE_ACL_2002 OR onnxruntime_USE_ACL_2308) set(onnxruntime_USE_ACL ON) if (onnxruntime_USE_ACL_1902) add_definitions(-DACL_1902=1) @@ -1110,7 +1167,11 @@ if (onnxruntime_USE_ACL OR onnxruntime_USE_ACL_1902 OR onnxruntime_USE_ACL_1905 if (onnxruntime_USE_ACL_2002) add_definitions(-DACL_2002=1) else() - add_definitions(-DACL_1905=1) + if (onnxruntime_USE_ACL_2308) + add_definitions(-DACL_2308=1) + else() + add_definitions(-DACL_1905=1) + endif() endif() endif() @@ -1177,14 +1238,10 @@ if (onnxruntime_USE_DNNL) add_compile_definitions(DNNL_OPENMP) endif() -set(USE_JBLAS FALSE) -if (onnxruntime_USE_JBLAS AND NOT onnxruntime_MINIMAL_BUILD) - if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU" AND onnxruntime_target_platform STREQUAL "x86_64") - add_compile_definitions(MLAS_JBLAS) - set(USE_JBLAS TRUE) - elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC" AND onnxruntime_target_platform STREQUAL "x64") - add_compile_definitions(MLAS_JBLAS) - set(USE_JBLAS TRUE) +if (onnxruntime_USE_NEURAL_SPEED AND NOT onnxruntime_MINIMAL_BUILD AND NOT onnxruntime_USE_TVM) + include(neural_speed) + if (USE_NEURAL_SPEED) + list(APPEND onnxruntime_EXTERNAL_LIBRARIES neural_speed::bestla) endif() endif() @@ -1228,17 +1285,15 @@ if (onnxruntime_USE_TVM) $) set(onnxruntime_tvm_libs onnxruntime_providers_tvm) - - # needs to link with stdc++fs in Linux - if (UNIX) - if (NOT APPLE) - set(FS_STDLIB stdc++fs) - endif() - endif() - list(APPEND onnxruntime_EXTERNAL_LIBRARIES tvm ${FS_STDLIB}) + list(APPEND onnxruntime_EXTERNAL_LIBRARIES tvm) list(APPEND onnxruntime_EXTERNAL_DEPENDENCIES tvm) endif() +# needs to link with stdc++fs in Linux +if (UNIX AND "${CMAKE_C_COMPILER_ID}" STREQUAL "GNU" AND CMAKE_C_COMPILER_VERSION VERSION_LESS 9) + set(FS_STDLIB stdc++fs) +endif() +list(APPEND onnxruntime_EXTERNAL_LIBRARIES ${FS_STDLIB}) # onnxruntime-extensions if (onnxruntime_USE_EXTENSIONS) @@ -1248,11 +1303,7 @@ endif() #Dependencies end. 
 #Dependencies end. In the next we'll enable "treat warning as error"
 #Adjust warning flags
-if (onnxruntime_USE_CUDA)
-  set_msvc_c_cpp_compiler_warning_level(3)
-else()
-  set_msvc_c_cpp_compiler_warning_level(4)
-endif()
+set_msvc_c_cpp_compiler_warning_level(4)
 set(onnxruntime_DELAYLOAD_FLAGS "")
@@ -1271,34 +1322,6 @@ if (onnxruntime_USE_OPENVINO)
   add_definitions(-DUSE_OPENVINO=1)
-  if (EXISTS "$ENV{INTEL_OPENVINO_DIR}/deployment_tools/inference_engine/version.txt")
-    file(READ $ENV{INTEL_OPENVINO_DIR}/deployment_tools/inference_engine/version.txt VER)
-  endif()
-
-  if (NOT DEFINED ENV{INTEL_OPENVINO_DIR})
-    message(FATAL_ERROR "[Couldn't locate OpenVINO] OpenVINO may not have been initialized")
-  endif()
-
-  # Check OpenVINO version for support
-  if ($ENV{INTEL_OPENVINO_DIR} MATCHES "2022.3")
-    set(OPENVINO_VERSION "2022.3")
-    add_definitions(-DOPENVINO_2022_3=1)
-  elseif ($ENV{INTEL_OPENVINO_DIR} MATCHES "2023.0")
-    set(OPENVINO_VERSION "2023.0")
-    add_definitions(-DOPENVINO_2023_0=1)
-  elseif ($ENV{INTEL_OPENVINO_DIR} MATCHES "2023.1")
-    set(OPENVINO_VERSION "2023.1")
-    add_definitions(-DOPENVINO_2023_1=1)
-  elseif ($ENV{INTEL_OPENVINO_DIR} MATCHES "2023.2")
-    set(OPENVINO_VERSION "2023.2")
-    add_definitions(-DOPENVINO_2023_1=1)
-  elseif ($ENV{INTEL_OPENVINO_DIR} MATCHES "openvino")
-    set(OPENVINO_VERSION "2023.2")
-    add_definitions(-DOPENVINO_2023_2=1)
-  else()
-    message(FATAL_ERROR "Unsupported OpenVINO version: ${INTEL_OPENVINO_DIR}")
-  endif()
-
   if (onnxruntime_USE_OPENVINO_GPU_FP32)
     add_definitions(-DOPENVINO_CONFIG_GPU_FP32=1)
   endif()
@@ -1315,6 +1338,10 @@ if (onnxruntime_USE_OPENVINO)
     add_definitions(-DOPENVINO_CONFIG_CPU_FP16=1)
   endif()
+  if (onnxruntime_USE_OPENVINO_NPU)
+    add_definitions(-DOPENVINO_CONFIG_NPU=1)
+  endif()
+
   if (onnxruntime_USE_OPENVINO_GPU_FP32_NP)
     add_definitions(-DOPENVINO_CONFIG_GPU_FP32=1)
     add_definitions(-DOPENVINO_DISABLE_GRAPH_PARTITION=1)
@@ -1335,6 +1362,11 @@ if (onnxruntime_USE_OPENVINO)
     add_definitions(-DOPENVINO_DISABLE_GRAPH_PARTITION=1)
   endif()
+  if (onnxruntime_USE_OPENVINO_NPU_NP)
+    add_definitions(-DOPENVINO_CONFIG_NPU=1)
+    add_definitions(-DOPENVINO_DISABLE_GRAPH_PARTITION=1)
+  endif()
+
   if (onnxruntime_USE_OPENVINO_HETERO)
     add_definitions(-DOPENVINO_CONFIG_HETERO=1)
     add_definitions(-DDEVICE_NAME="${onnxruntime_USE_OPENVINO_DEVICE}")
@@ -1389,6 +1421,10 @@ endif()
 if (onnxruntime_USE_CUDA)
   set(CMAKE_CUDA_RUNTIME_LIBRARY Shared)
   set(CMAKE_CUDA_STANDARD 17)
+  if(onnxruntime_CUDA_HOME)
+    file(TO_CMAKE_PATH ${onnxruntime_CUDA_HOME} CUDAToolkit_ROOT)
+  endif()
+  find_package(CUDAToolkit REQUIRED)
   if(onnxruntime_CUDNN_HOME)
     file(TO_CMAKE_PATH ${onnxruntime_CUDNN_HOME} onnxruntime_CUDNN_HOME)
   endif()
@@ -1430,6 +1466,11 @@ if (onnxruntime_USE_CUDA)
   if (NOT WIN32)
     list(APPEND CUDA_NVCC_FLAGS --compiler-options -fPIC)
   endif()
+  if(MSVC)
+    if(CUDA_NVCC_FLAGS MATCHES "Zi")
+      list(APPEND CUDA_NVCC_FLAGS "-Xcompiler" "-FS")
+    endif()
+  endif()
   # Options passed to cudafe
   set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcudafe \"--diag_suppress=bad_friend_decl\"")
   set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcudafe \"--diag_suppress=unsigned_compare_with_zero\"")
@@ -1589,7 +1630,7 @@ if (UNIX AND onnxruntime_USE_NCCL)
 else()
   set(onnxruntime_USE_NCCL OFF)
   set(onnxruntime_USE_MPI OFF)
-message( WARNING "MPI and NCCL disabled on Win build." )
+  message( WARNING "MPI and NCCL are disabled because the build is on Windows or USE_NCCL is set to OFF." )
 endif()
 if (onnxruntime_USE_MPI)
@@ -1718,14 +1759,12 @@ if(onnxruntime_BUILD_KERNEL_EXPLORER)
 endif()
 # When GDK_PLATFORM is set then WINAPI_FAMILY is defined in gdk_toolchain.cmake (along with other relevant flags/definitions).
-if (WIN32 AND NOT GDK_PLATFORM)
+if (WIN32 AND NOT GDK_PLATFORM AND NOT CMAKE_CROSSCOMPILING)
   if (NOT CMAKE_CXX_STANDARD_LIBRARIES MATCHES kernel32.lib)
     # On onecore, link to the onecore build of the MSVC runtime
     get_filename_component(msvc_path "${CMAKE_C_COMPILER}/../../../.." ABSOLUTE)
     link_directories(BEFORE "${msvc_path}/lib/onecore/${onnxruntime_target_platform}")
-    # The .lib files in the MSVC runtime have a DEFAULITLIB entry for onecore.lib, which in turn links to reverse forwarders.
-    # We ignore that entry and use onecore_apiset.lib instead, since system components must not rely on reverse forwarders.
-    add_link_options("/NODEFAULTLIB:onecore.lib")
+    # The .lib files in the MSVC runtime have a DEFAULTLIB entry for onecore.lib, but it should not cause any conflict with onecoreuap.lib
   endif()
 endif()
diff --git a/cmake/adjust_global_compile_flags.cmake b/cmake/adjust_global_compile_flags.cmake
index 3085beb37927..74d6418ac541 100644
--- a/cmake/adjust_global_compile_flags.cmake
+++ b/cmake/adjust_global_compile_flags.cmake
@@ -8,6 +8,15 @@ if (CMAKE_SYSTEM_NAME STREQUAL "Android")
   string(APPEND CMAKE_ASM_FLAGS_RELEASE " -O3")
 endif()
+# Suggested by https://gitlab.kitware.com/cmake/cmake/-/issues/20132
+# MacCatalyst is not well supported in CMake
+# The error that can emerge without this flag can look like:
+# "clang : error : overriding '-mmacosx-version-min=11.0' option with '-target x86_64-apple-ios14.0-macabi' [-Werror,-Woverriding-t-option]"
+if (PLATFORM_NAME STREQUAL "macabi")
+  add_compile_options(-Wno-overriding-t-option)
+  add_link_options(-Wno-overriding-t-option)
+endif()
+
 # Enable space optimization for gcc/clang
 # Cannot use "-ffunction-sections -fdata-sections" if we enable bitcode (iOS)
 if (NOT MSVC AND NOT onnxruntime_ENABLE_BITCODE)
@@ -16,9 +25,7 @@ if (NOT MSVC AND NOT onnxruntime_ENABLE_BITCODE)
 endif()
 if (CMAKE_SYSTEM_NAME STREQUAL "Emscripten")
-  string(APPEND CMAKE_C_FLAGS " -s STRICT=1 -s DEFAULT_TO_CXX=1")
-  string(APPEND CMAKE_CXX_FLAGS " -s STRICT=1 -s DEFAULT_TO_CXX=1")
-  set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -s ALLOW_UNIMPLEMENTED_SYSCALLS=1")
+  set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -s ALLOW_UNIMPLEMENTED_SYSCALLS=1 -s DEFAULT_TO_CXX=1")
   # Enable LTO for release single-thread build
   if (NOT CMAKE_BUILD_TYPE STREQUAL "Debug")
@@ -74,11 +81,6 @@ if (onnxruntime_MINIMAL_BUILD)
   endif()
   if (MSVC)
-    # turn on LTO (which adds some compiler flags and turns on LTCG) unless it's a Debug build to minimize binary size
-    if (NOT CMAKE_BUILD_TYPE STREQUAL "Debug")
-      set(onnxruntime_ENABLE_LTO ON)
-    endif()
-
     # undocumented internal flag to allow analysis of a minimal build binary size
     if (ADD_DEBUG_INFO_TO_MINIMAL_BUILD)
       string(APPEND CMAKE_CXX_FLAGS " /Zi")
@@ -99,7 +101,7 @@ if (onnxruntime_MINIMAL_BUILD)
   endif()
 endif()
-# enable stream for all the non-minimal build
+# Enable stream for all the non-minimal build
 if (NOT onnxruntime_MINIMAL_BUILD)
   add_compile_definitions(ORT_ENABLE_STREAM)
 endif()
@@ -130,6 +132,11 @@ if (onnxruntime_DISABLE_RTTI)
   add_compile_options("$<$<COMPILE_LANGUAGE:CXX>:/GR->" "$<$<COMPILE_LANGUAGE:CXX>:/we4541>")
 else()
   add_compile_options("$<$<COMPILE_LANGUAGE:CXX>:-fno-rtti>")
+  if (onnxruntime_USE_WEBNN)
+    # Avoid unboundTypeError for WebNN EP since unbound type names are illegal with RTTI disabled
+    # in Embind API, relevant issue: https://github.com/emscripten-core/emscripten/issues/7001
+    add_compile_options("$<$<COMPILE_LANGUAGE:CXX>:-DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0>")
+  endif()
 endif()
 else()
   #MSVC RTTI flag /GR is not added to CMAKE_CXX_FLAGS by default. But, anyway VC++2019 treats "/GR" default on.
@@ -207,7 +214,7 @@ endif()
 macro(check_nvcc_compiler_flag _FLAG _RESULT)
-    execute_process(COMMAND ${onnxruntime_CUDA_HOME}/bin/nvcc "${_FLAG}" RESULT_VARIABLE NVCC_OUT ERROR_VARIABLE NVCC_ERROR)
+    execute_process(COMMAND ${CUDAToolkit_BIN_DIR}/nvcc "${_FLAG}" RESULT_VARIABLE NVCC_OUT ERROR_VARIABLE NVCC_ERROR)
     message("NVCC_ERROR = ${NVCC_ERROR}")
     message("NVCC_OUT = ${NVCC_OUT}")
     if ("${NVCC_OUT}" MATCHES "0")
@@ -267,39 +274,38 @@ if (MSVC)
     string(APPEND CMAKE_C_FLAGS " /arch:AVX512")
   endif()
-  if (NOT GDK_PLATFORM)
-    add_compile_definitions(WINAPI_FAMILY=100) # Desktop app
-    message("Building ONNX Runtime for Windows 10 and newer")
-    add_compile_definitions(WINVER=0x0A00 _WIN32_WINNT=0x0A00 NTDDI_VERSION=0x0A000000)
-  endif()
   if (onnxruntime_ENABLE_LTO AND NOT onnxruntime_USE_CUDA)
     set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /Gw /GL")
     set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} /Gw /GL")
     set(CMAKE_CXX_FLAGS_MINSIZEREL "${CMAKE_CXX_FLAGS_MINSIZEREL} /Gw /GL")
   endif()
-
-  # The WinML build tool chain builds ARM/ARM64, and the internal tool chain does not have folders for spectre mitigation libs.
-  # WinML performs spectre mitigation differently.
-  if (NOT DEFINED onnxruntime_DISABLE_QSPECTRE_CHECK)
-    check_cxx_compiler_flag(-Qspectre HAS_QSPECTRE)
-    if (HAS_QSPECTRE)
-      set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /Qspectre")
-      set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /Qspectre")
-    endif()
-  endif()
-  set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} /DYNAMICBASE")
-  check_cxx_compiler_flag(-guard:cf HAS_GUARD_CF)
-  if (HAS_GUARD_CF)
-    set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /guard:cf")
-    set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /guard:cf")
-    set(CMAKE_C_FLAGS_RELWITHDEBINFO "${CMAKE_C_FLAGS_RELWITHDEBINFO} /guard:cf")
-    set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} /guard:cf")
-    set(CMAKE_C_FLAGS_MINSIZEREL "${CMAKE_C_FLAGS_MINSIZEREL} /guard:cf")
-    set(CMAKE_CXX_FLAGS_MINSIZEREL "${CMAKE_CXX_FLAGS_MINSIZEREL} /guard:cf")
-    set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} /guard:cf")
-  endif()
 else()
   if (NOT APPLE)
+    #XXX: Sometimes the value of CMAKE_SYSTEM_PROCESSOR is set but it's wrong. For example, if you run an armv7 docker
+    #image on an aarch64 machine with an aarch64 Ubuntu host OS, in the docker instance cmake may still report
+    # CMAKE_SYSTEM_PROCESSOR as aarch64 by default. Given that compiling this code may need more than 2GB of memory and
+    # we do not support compiling for ARM32 natively (only cross-compiling is supported), we will ignore this issue for now.
+    if(NOT CMAKE_SYSTEM_PROCESSOR)
+      message(WARNING "CMAKE_SYSTEM_PROCESSOR is not set.
Please set it in your toolchain cmake file.") + # Try to detect it + if("${CMAKE_C_COMPILER_ID}" STREQUAL "GNU" OR "${CMAKE_C_COMPILER_ID}" STREQUAL "Clang") + execute_process( + COMMAND "${CMAKE_C_COMPILER}" -dumpmachine + OUTPUT_VARIABLE GCC_DUMP_MACHINE_OUT OUTPUT_STRIP_TRAILING_WHITESPACE + ERROR_VARIABLE _err + RESULT_VARIABLE _res + ) + if(NOT _res EQUAL 0) + message(SEND_ERROR "Failed to run 'gcc -dumpmachine':\n ${_res}") + endif() + string(REPLACE "-" ";" GCC_DUMP_MACHINE_OUT_LIST "${GCC_DUMP_MACHINE_OUT}") + list(LENGTH GCC_DUMP_MACHINE_OUT_LIST GCC_TRIPLET_LEN) + if(GCC_TRIPLET_LEN EQUAL 4) + list(GET GCC_DUMP_MACHINE_OUT_LIST 0 CMAKE_SYSTEM_PROCESSOR) + message("Setting CMAKE_SYSTEM_PROCESSOR to ${CMAKE_SYSTEM_PROCESSOR}") + endif() + endif() + endif() set(onnxruntime_target_platform ${CMAKE_SYSTEM_PROCESSOR}) endif() if (onnxruntime_BUILD_FOR_NATIVE_MACHINE) @@ -353,16 +359,9 @@ else() endif() -if (${CMAKE_SYSTEM_NAME} MATCHES "Darwin") - #For Mac compliance - message("Adding flags for Mac builds") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fstack-protector-strong") -elseif (WIN32) - # parallel build - # These compiler opitions cannot be forwarded to NVCC, so cannot use add_compiler_options - string(APPEND CMAKE_CXX_FLAGS " /MP") +if (WIN32) # required to be set explicitly to enable Eigen-Unsupported SpecialFunctions string(APPEND CMAKE_CXX_FLAGS " -DEIGEN_HAS_C99_MATH") -else() +elseif(LINUX) add_compile_definitions("_GNU_SOURCE") endif() diff --git a/cmake/deps.txt b/cmake/deps.txt index ff0780301307..d0f455167168 100644 --- a/cmake/deps.txt +++ b/cmake/deps.txt @@ -12,7 +12,8 @@ # NOTE: You must run deps_update_and_upload.py and generate_cgmanifest.py when ready to test your changes in a CI. # See https://microsoft.sharepoint.com/teams/ONNX2/_layouts/OneNote.aspx?id=%2Fteams%2FONNX2%2FShared%20Documents%2FNotebooks%2FONNX%20Ecosystem%20Team%20Notebook&wd=target%28Development.one%7C63D3AB47-51D1-4A62-9965-66882234BD44%2FAdd%20or%20update%20a%20dependency%20in%20deps.txt%7C0E9ED71D-89D5-40FA-B05F-C0123289C591%2F%29 # -abseil_cpp;https://github.com/abseil/abseil-cpp/archive/dcd5bd5fd593e31465af3d9ef291d26c646b0a4f.zip;6cc204586014e189f5c0fe3274f83162fa7c700c +abseil_cpp;https://github.com/abseil/abseil-cpp/archive/refs/tags/20240116.0.zip;bc2cec6baaad67fcb6c0c38972b687d4797927e9 +coremltools;https://github.com/apple/coremltools/archive/refs/tags/7.1.zip;f1bab0f30966f2e217d8e01207d518f230a1641a cxxopts;https://github.com/jarro2783/cxxopts/archive/3c73d91c0b04e2b59462f0a741be8c07024c1bc0.zip;6c6ca7f8480b26c8d00476e0e24b7184717fe4f0 date;https://github.com/HowardHinnant/date/archive/refs/tags/v3.0.1.zip;2dac0c81dc54ebdd8f8d073a75c053b04b56e159 dlpack;https://github.com/dmlc/dlpack/archive/refs/tags/v0.6.zip;4d565dd2e5b31321e5549591d78aa7f377173445 @@ -22,10 +23,10 @@ dlpack;https://github.com/dmlc/dlpack/archive/refs/tags/v0.6.zip;4d565dd2e5b3132 # Until the 3.4.1 release this is the best option we have. 
# Issue link: https://gitlab.com/libeigen/eigen/-/issues/2744 eigen;https://gitlab.com/libeigen/eigen/-/archive/e7248b26a1ed53fa030c5c459f7ea095dfd276ac/eigen-e7248b26a1ed53fa030c5c459f7ea095dfd276ac.zip;be8be39fdbc6e60e94fa7870b280707069b5b81a -flatbuffers;https://github.com/google/flatbuffers/archive/refs/tags/v1.12.0.zip;ba0a75fd12dbef8f6557a74e611b7a3d0c5fe7bf +flatbuffers;https://github.com/google/flatbuffers/archive/refs/tags/v23.5.26.zip;59422c3b5e573dd192fead2834d25951f1c1670c fp16;https://github.com/Maratyszcza/FP16/archive/0a92994d729ff76a58f692d3028ca1b64b145d91.zip;b985f6985a05a1c03ff1bb71190f66d8f98a1494 fxdiv;https://github.com/Maratyszcza/FXdiv/archive/63058eff77e11aa15bf531df5dd34395ec3017c8.zip;a5658f4036402dbca7cebee32be57fb8149811e1 -google_benchmark;https://github.com/google/benchmark/archive/refs/tags/v1.7.0.zip;e97c368b176e8614e3f1bf13dd9abcf6a7ad9908 +google_benchmark;https://github.com/google/benchmark/archive/refs/tags/v1.8.3.zip;bf9870756ee3f8d2d3b346b24ee3600a41c74d3d google_nsync;https://github.com/google/nsync/archive/refs/tags/1.26.0.zip;5e7c00ef6bf5b787386fc040067903ec774e2752 googletest;https://github.com/google/googletest/archive/530d5c8c84abd2a46f38583ee817743c9b3a42b4.zip;5e3a61db2aa975cfd0f97ba92c818744e7fa7034 googlexnnpack;https://github.com/google/XNNPACK/archive/0da379fc4808f9601faef392352018c741c0f297.zip;663883491e380b628e0a5b162b5f2658032fae73 @@ -34,9 +35,10 @@ microsoft_gsl;https://github.com/microsoft/GSL/archive/refs/tags/v4.0.0.zip;cf36 microsoft_wil;https://github.com/microsoft/wil/archive/refs/tags/v1.0.230629.1.zip;e4a542a323c070376f7c2d1973d0f7ddbc1d2fa5 mimalloc;https://github.com/microsoft/mimalloc/archive/refs/tags/v2.1.1.zip;d5ee7d34223d0567892db5179849939c8769dc41 mp11;https://github.com/boostorg/mp11/archive/refs/tags/boost-1.82.0.zip;9bc9e01dffb64d9e0773b2e44d2f22c51aace063 -onnx;https://github.com/onnx/onnx/archive/refs/tags/v1.15.0.zip;54c3f960a0541c5d8d3e60c2933e11f5d3688a11 -#use the commit of supporting all the plugins and TRT 8.6-GA (https://github.com/onnx/onnx-tensorrt/commit/0462dc31ae78f48744b6141ae376df1f96d3f459) -onnx_tensorrt;https://github.com/onnx/onnx-tensorrt/archive/a43ce67187bab219520fd80f21af8bbd4354bc8c.zip;572535aefef477050f86744dfab1fef840198035 +neural_speed;https://github.com/intel/neural-speed/archive/refs/tags/v0.3.zip;5ec64e3071edc7347ebd8a81679cf06e2bb9b851 +onnx;https://github.com/onnx/onnx/archive/refs/tags/v1.16.0.zip;a6d8b619459fb4657f8bec7d1c6d95ad6d4c069d +#use the commit of Final DDS removal. DDS output is now supported by ORT TRT. 
+onnx_tensorrt;https://github.com/onnx/onnx-tensorrt/archive/bacfaaa951653cd4e72efe727a543567cb38f7de.zip;26434329612e804164ab7baa6ae629ada56c1b26 protobuf;https://github.com/protocolbuffers/protobuf/archive/refs/tags/v21.12.zip;7cf2733949036c7d52fda017badcab093fe73bfa protoc_win64;https://github.com/protocolbuffers/protobuf/releases/download/v21.12/protoc-21.12-win64.zip;b4521f7ada5b260380f94c4bd7f1b7684c76969a protoc_win32;https://github.com/protocolbuffers/protobuf/releases/download/v21.12/protoc-21.12-win32.zip;3688010318192c46ce73213cdfb6b3e5656da874 @@ -55,3 +57,4 @@ cutlass;https://github.com/NVIDIA/cutlass/archive/refs/tags/v3.1.0.zip;757f90a79 utf8_range;https://github.com/protocolbuffers/utf8_range/archive/72c943dea2b9240cd09efde15191e144bc7c7d38.zip;9925739c9debc0efa2adcb194d371a35b6a03156 extensions;https://github.com/microsoft/onnxruntime-extensions/archive/94142d8391c9791ec71c38336436319a2d4ac7a0.zip;4365ac5140338b4cb75a39944a4be276e3829b3c composable_kernel;https://github.com/ROCmSoftwarePlatform/composable_kernel/archive/5356c4a943a35e74d7cdc69486afcb8703b9a59a.zip;522382c2af437e09124287e5879ab64af5b2e299 +directx_headers;https://github.com/microsoft/DirectX-Headers/archive/refs/tags/v1.613.1.zip;47653509a3371eabb156360f42faf582f314bf2e \ No newline at end of file diff --git a/cmake/deps_update_and_upload.py b/cmake/deps_update_and_upload.py index d357284d9122..63df3f6f0386 100644 --- a/cmake/deps_update_and_upload.py +++ b/cmake/deps_update_and_upload.py @@ -1,56 +1,109 @@ -# in case deps.txt is updated, run this file to update and upload the dependencies so that CI can use them. -# Before running the script, increase the version number found at: +# If deps.txt is updated, run this file to update and upload the dependencies so that CI can use them. +# +# Before running the script, find the latest version number at: # https://aiinfra.visualstudio.com/Lotus/_artifacts/feed/Lotus/UPack/onnxruntime_build_dependencies/versions +# Increment it to obtain a new version number to use. +# # Run without --do-upload once to verify downloading. Use --do-upload when you are ready to publish. -# python cmake/deps_update_and_upload.py --root-path C:/temp/onnxruntime_deps --version 1.0.82 --do-upload -# update version number in tools\ci_build\github\azure-pipelines\templates\download-deps.yml +# E.g.: +# python cmake/deps_update_and_upload.py --root-path C:/temp/onnxruntime_deps --version 1.0.82 +# # check contents of C:/temp/onnxruntime_deps +# python cmake/deps_update_and_upload.py --root-path C:/temp/onnxruntime_deps --version 1.0.82 --no-download --do-upload +# +# Next, update the version number in tools/ci_build/github/azure-pipelines/templates/download-deps.yml. + +import argparse +import contextlib +import pathlib import re import subprocess -import os -import argparse import tempfile +script_dir = pathlib.Path(__file__).parent + parser = argparse.ArgumentParser(description="Update dependencies and publish to Azure Artifacts") parser.add_argument( - "--root-path", type=str, default=tempfile.gettempdir(), help="Target root path for downloaded files" + "--root-path", + type=pathlib.Path, + help="Target root path for downloaded files. 
If not provided, a temporary directory is used.", +) +parser.add_argument( + "--version", + type=str, + help="Package version to publish", +) +parser.add_argument( + "--do-upload", + action="store_true", + dest="upload", + help="Upload the package to Azure Artifacts", +) +parser.add_argument( + "--no-download", + action="store_false", + dest="download", + help="Skip downloading the dependency files. " + "Use with '--do-upload' and '--root-path' to upload the package from existing dependency files.", ) -parser.add_argument("--version", type=str, default="1.0.82", help="Package version to publish") -parser.add_argument("--do-upload", action="store_true", help="Upload the package to Azure Artifacts") args = parser.parse_args() -with open("cmake/deps.txt") as file: +if args.upload: + assert args.version is not None, "'--version' must be specified if uploading." + +if args.upload != args.download: + assert args.root_path is not None, "'--root-path' must be specified if only downloading or uploading." + +deps_path = script_dir / "deps.txt" +with open(deps_path) as file: text = file.read() lines = [line for line in text.split("\n") if not line.startswith("#") and ";" in line] -root_path = args.root_path - -for line in lines: - url = re.sub("^[^;]+?;https://([^;]+?);.*", r"https://\1", line) - filename = re.sub("^[^;]+?;https://([^;]+?);.*", r"\1", line) - full_path = os.path.join(root_path, filename) - subprocess.run(["curl", "-sSL", "--create-dirs", "-o", full_path, url]) # noqa: PLW1510 - -package_name = "onnxruntime_build_dependencies" -version = args.version - -# Check if the user is logged in to Azure -result = subprocess.run("az account show", shell=True, capture_output=True, text=True) # noqa: PLW1510 -if "No subscriptions found" in result.stderr: - # Prompt the user to log in to Azure - print("You are not logged in to Azure. 
Please log in to continue.") - subprocess.run("az login", shell=True) # noqa: PLW1510 - -# Publish the package to Azure Artifacts if --no-upload is not specified - -cmd = f'az artifacts universal publish --organization https://dev.azure.com/onnxruntime --feed onnxruntime --name {package_name} --version {version} --description "onnxruntime build time dependencies" --path {root_path}' -if args.do_upload: - subprocess.run(cmd, shell=True) # noqa: PLW1510 -else: - print("would have run: " + cmd) - -cmd = f'az artifacts universal publish --organization https://dev.azure.com/aiinfra --feed Lotus --name {package_name} --version {version} --description "onnxruntime build time dependencies" --path {root_path}' -if args.do_upload: - subprocess.run(cmd, shell=True) # noqa: PLW1510 -else: - print("would have run: " + cmd) +with contextlib.ExitStack() as context_stack: + if args.root_path is not None: + root_path = args.root_path.resolve() + root_path.mkdir(parents=True, exist_ok=True) + else: + temp_dir_name = context_stack.enter_context(tempfile.TemporaryDirectory()) + root_path = pathlib.Path(temp_dir_name) + + if args.download: + print(f"Downloading dependencies to directory: {root_path}") + + dep_pattern = re.compile(r"^[^;]+;https://([^;]+);.*$") + + for line in lines: + match = dep_pattern.fullmatch(line) + if match is None: + continue + + dep_path = match[1] + url = f"https://{dep_path}" + full_path = root_path / dep_path + + subprocess.run(["curl", "-sSL", "--create-dirs", "-o", str(full_path), url], check=True) + + package_name = "onnxruntime_build_dependencies" + version = args.version if args.version is not None else "VERSION_PLACEHOLDER" + + if args.upload: + # Check if the user is logged in to Azure + result = subprocess.run("az account show", shell=True, capture_output=True, text=True, check=False) + if "No subscriptions found" in result.stderr: + # Prompt the user to log in to Azure + print("You are not logged in to Azure. Please log in to continue.") + subprocess.run("az login", shell=True, check=True) + + # Publish the package to Azure Artifacts if --do-upload is specified + + cmd = f'az artifacts universal publish --organization https://dev.azure.com/onnxruntime --feed onnxruntime --name {package_name} --version {version} --description "onnxruntime build time dependencies" --path {root_path}' + if args.upload: + subprocess.run(cmd, shell=True, check=True) + else: + print("would have run: " + cmd) + + cmd = f'az artifacts universal publish --organization https://dev.azure.com/aiinfra --feed Lotus --name {package_name} --version {version} --description "onnxruntime build time dependencies" --path {root_path}' + if args.upload: + subprocess.run(cmd, shell=True, check=True) + else: + print("would have run: " + cmd) diff --git a/cmake/external/abseil-cpp.cmake b/cmake/external/abseil-cpp.cmake index 3bcd4109e288..57cfbee4644e 100644 --- a/cmake/external/abseil-cpp.cmake +++ b/cmake/external/abseil-cpp.cmake @@ -19,7 +19,7 @@ if(WIN32 AND NOT Patch_FOUND) set(ABSL_ENABLE_INSTALL ON) endif() # NB! Advancing Abseil version changes its internal namespace, -# currently absl::lts_20230125 which affects abseil-cpp.natvis debugger +# currently absl::lts_20240116 which affects abseil-cpp.natvis debugger # visualization file, that must be adjusted accordingly, unless we eliminate # that namespace at build time. 
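Since the natvis file hard-codes that inline namespace, bumping the Abseil version in deps.txt silently breaks the debugger visualizers until the file is edited to match. A hypothetical configure-time guard (not part of this change) could flag the mismatch early:

    # Hypothetical sketch: warn if abseil-cpp.natvis still references an old
    # Abseil LTS inline namespace after the dependency is advanced.
    file(READ "${CMAKE_CURRENT_LIST_DIR}/abseil-cpp.natvis" _natvis_content)
    if(NOT _natvis_content MATCHES "lts_20240116")
      message(WARNING "abseil-cpp.natvis may be out of sync with the Abseil LTS namespace")
    endif()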
 FetchContent_Declare(
diff --git a/cmake/external/abseil-cpp.natvis b/cmake/external/abseil-cpp.natvis
index 1e5a36fb9efb..a4fb63b6a837 100644
--- a/cmake/external/abseil-cpp.natvis
+++ b/cmake/external/abseil-cpp.natvis
[natvis hunks garbled in extraction: each one renames the Abseil LTS inline namespace in the <Type Name="..."> entries from absl::lts_20230125 to absl::lts_20240116; the DisplayString/Expand bodies such as "*($T1 *){value}" and "{value.first}, {value.second}" are unchanged.]
diff --git a/cmake/external/cutlass.cmake b/cmake/external/cutlass.cmake
index efc708bd681c..f04f4bec76cd 100644
--- a/cmake/external/cutlass.cmake
+++ b/cmake/external/cutlass.cmake
@@ -1,13 +1,11 @@
-if (onnxruntime_USE_CUTLASS)
-  include(FetchContent)
-  FetchContent_Declare(
-    cutlass
-    URL ${DEP_URL_cutlass}
-    URL_HASH SHA1=${DEP_SHA1_cutlass}
-  )
+include(FetchContent)
+FetchContent_Declare(
+  cutlass
+  URL ${DEP_URL_cutlass}
+  URL_HASH SHA1=${DEP_SHA1_cutlass}
+)
-  FetchContent_GetProperties(cutlass)
-  if(NOT cutlass_POPULATED)
-    FetchContent_Populate(cutlass)
-  endif()
+FetchContent_GetProperties(cutlass)
+if(NOT cutlass_POPULATED)
+  FetchContent_Populate(cutlass)
 endif()
diff --git a/cmake/external/dml.cmake b/cmake/external/dml.cmake
index 5d25b9529e03..8f18059ffdfe 100644
--- a/cmake/external/dml.cmake
+++ b/cmake/external/dml.cmake
@@ -41,7 +41,7 @@ if (NOT onnxruntime_USE_CUSTOM_DIRECTML)
     set(NUGET_CONFIG ${PROJECT_SOURCE_DIR}/../NuGet.config)
     set(PACKAGES_CONFIG ${PROJECT_SOURCE_DIR}/../packages.config)
     get_filename_component(PACKAGES_DIR ${CMAKE_CURRENT_BINARY_DIR}/../packages ABSOLUTE)
-    set(DML_PACKAGE_DIR ${PACKAGES_DIR}/Microsoft.AI.DirectML.1.12.1)
+    set(DML_PACKAGE_DIR ${PACKAGES_DIR}/Microsoft.AI.DirectML.1.13.1)
     # Restore nuget packages, which will pull down the DirectML redist package.
     add_custom_command(
@@ -72,12 +72,11 @@ else()
   if (dml_EXTERNAL_PROJECT)
     set(dml_preset_config $<IF:$<CONFIG:Debug>,debug,release>)
     set(dml_preset_name ${onnxruntime_target_platform}-win-redist-${dml_preset_config})
-    target_compile_definitions(DirectML INTERFACE DML_TARGET_VERSION_USE_LATEST=1)
     include(ExternalProject)
     ExternalProject_Add(
       directml_repo
       GIT_REPOSITORY https://dev.azure.com/microsoft/WindowsAI/_git/DirectML
-      GIT_TAG d460f0f46967bea878786f1bed69487692c779bf
+      GIT_TAG a5312f72c51864b4d705ac62d25d08bcd88c4fb1
       GIT_SHALLOW OFF # not allowed when GIT_TAG is a commit SHA, which is preferred (it's stable, unlike branches)
       GIT_PROGRESS ON
       BUILD_IN_SOURCE ON
@@ -94,8 +93,20 @@ else()
     target_link_libraries(DirectML INTERFACE ${directml_install_path}/lib/DirectML.lib)
     add_dependencies(DirectML directml_repo-install)
     include_directories(BEFORE ${directml_install_path}/include)
+    target_compile_definitions(DirectML INTERFACE DML_TARGET_VERSION_USE_LATEST=1)
   else()
     include_directories(BEFORE ${dml_INCLUDE_DIR})
     set(DML_PACKAGE_DIR ${dml_INCLUDE_DIR}/..)
   endif()
 endif()
+
+FetchContent_Declare(
+  directx_headers
+  URL ${DEP_URL_directx_headers}
+  URL_HASH SHA1=${DEP_SHA1_directx_headers}
+)
+
+FetchContent_Populate(directx_headers)
+set(directx_headers_INCLUDE_DIRS "${directx_headers_SOURCE_DIR}/include")
+
+include_directories(BEFORE ${directx_headers_INCLUDE_DIRS})
diff --git a/cmake/external/dnnl.cmake b/cmake/external/dnnl.cmake
index d7b70640781d..9eb5fed7a1af 100644
--- a/cmake/external/dnnl.cmake
+++ b/cmake/external/dnnl.cmake
@@ -2,7 +2,7 @@ include (ExternalProject)
 set(DNNL_URL https://github.com/oneapi-src/onednn.git)
 # If DNNL_TAG is updated, check if MKLML_VERSION and platform.cmake.patch need to be updated.
-set(DNNL_TAG v3.0)
+set(DNNL_TAG v3.0.1)
 if(WIN32)
   set(DNNL_SHARED_LIB dnnl.dll)
diff --git a/cmake/external/emsdk b/cmake/external/emsdk
index a896e3d06644..4e2496141eda 160000
--- a/cmake/external/emsdk
+++ b/cmake/external/emsdk
@@ -1 +1 @@
-Subproject commit a896e3d066448b3530dbcaa48869fafefd738f57
+Subproject commit 4e2496141eda15040c44e9bbf237a1326368e34c
diff --git a/cmake/external/neural_speed.cmake b/cmake/external/neural_speed.cmake
new file mode 100644
index 000000000000..3fe9c660f89d
--- /dev/null
+++ b/cmake/external/neural_speed.cmake
@@ -0,0 +1,16 @@
+if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU" AND onnxruntime_target_platform STREQUAL "x86_64")
+  set(USE_NEURAL_SPEED TRUE)
+elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC" AND onnxruntime_target_platform STREQUAL "x64")
+  set(USE_NEURAL_SPEED TRUE)
+endif()
+
+if(USE_NEURAL_SPEED)
+  FetchContent_Declare(
+    neural_speed
+    URL ${DEP_URL_neural_speed}
+    URL_HASH SHA1=${DEP_SHA1_neural_speed}
+    PATCH_COMMAND ${Patch_EXECUTABLE} -p1 < ${PROJECT_SOURCE_DIR}/patches/neural_speed/150e7527d5286ddd3a995c228dedf8d76a7a86bc.patch
+  )
+  set(BTLA_USE_OPENMP OFF)
+  onnxruntime_fetchcontent_makeavailable(neural_speed)
+endif()
diff --git a/cmake/external/onnx b/cmake/external/onnx
index b86cc54efce1..990217f043af 160000
--- a/cmake/external/onnx
+++ b/cmake/external/onnx
@@ -1 +1 @@
-Subproject commit b86cc54efce19530fb953e4b21f57e6b3888534c
+Subproject commit 990217f043af7222348ca8f0301e17fa7b841781
diff --git a/cmake/external/onnxruntime_external_deps.cmake b/cmake/external/onnxruntime_external_deps.cmake
index 78f63227c839..8839dbc8fda4 100644
--- a/cmake/external/onnxruntime_external_deps.cmake
+++ b/cmake/external/onnxruntime_external_deps.cmake
@@ -14,6 +14,16 @@ foreach(ONNXRUNTIME_DEP IN LISTS ONNXRUNTIME_DEPS_LIST)
     set(DEP_URL_${ONNXRUNTIME_DEP_NAME} ${ONNXRUNTIME_DEP_URL})
     # The third column is SHA1 hash value
     set(DEP_SHA1_${ONNXRUNTIME_DEP_NAME} ${ONNXRUNTIME_DEP})
+
+    if(ONNXRUNTIME_DEP_URL MATCHES "^https://")
+      # Search a local mirror folder
+      string(REGEX REPLACE "^https://" "${REPO_ROOT}/mirror/" LOCAL_URL "${ONNXRUNTIME_DEP_URL}")
+
+      if(EXISTS "${LOCAL_URL}")
+        cmake_path(ABSOLUTE_PATH LOCAL_URL)
+        set(DEP_URL_${ONNXRUNTIME_DEP_NAME} "${LOCAL_URL}")
+      endif()
+    endif()
   endif()
 endforeach()
@@ -37,8 +47,13 @@ if (onnxruntime_BUILD_UNIT_TESTS)
     set(gtest_disable_pthreads ON)
   endif()
   set(INSTALL_GTEST OFF CACHE BOOL "" FORCE)
-  if (CMAKE_SYSTEM_NAME STREQUAL "iOS")
-    # Needs to update onnxruntime/test/xctest/xcgtest.mm
+  if (IOS OR ANDROID)
+    # On mobile platforms the absl flags class drops the flag names (presumably to save binary size), which breaks
+    # passing any args to gtest executables, such as using --gtest_filter to debug a specific test.
+ # Processing of compile definitions: + # https://github.com/abseil/abseil-cpp/blob/8dc90ff07402cd027daec520bb77f46e51855889/absl/flags/config.h#L21 + # If set, this code throws away the flag and does nothing on registration, which results in no flags being known: + # https://github.com/abseil/abseil-cpp/blob/8dc90ff07402cd027daec520bb77f46e51855889/absl/flags/flag.h#L205-L217 set(GTEST_HAS_ABSL OFF CACHE BOOL "" FORCE) else() set(GTEST_HAS_ABSL ON CACHE BOOL "" FORCE) @@ -104,45 +119,18 @@ FetchContent_Declare( URL ${DEP_URL_flatbuffers} URL_HASH SHA1=${DEP_SHA1_flatbuffers} PATCH_COMMAND ${ONNXRUNTIME_FLATBUFFERS_PATCH_COMMAND} - FIND_PACKAGE_ARGS 1.12.0...<2.0.0 NAMES Flatbuffers + FIND_PACKAGE_ARGS 23.5.9 NAMES Flatbuffers ) # Download a protoc binary from Internet if needed -if(CMAKE_CROSSCOMPILING AND NOT ONNX_CUSTOM_PROTOC_EXECUTABLE) +if(NOT ONNX_CUSTOM_PROTOC_EXECUTABLE) # This part of code is only for users' convenience. The code couldn't handle all cases. Users always can manually # download protoc from Protobuf's Github release page and pass the local path to the ONNX_CUSTOM_PROTOC_EXECUTABLE # variable. - message("CMAKE_HOST_SYSTEM_NAME: ${CMAKE_HOST_SYSTEM_NAME}") - if(CMAKE_HOST_SYSTEM_NAME STREQUAL "Windows") - if(CMAKE_HOST_SYSTEM_PROCESSOR STREQUAL "AMD64") - FetchContent_Declare(protoc_binary URL ${DEP_URL_protoc_win64} URL_HASH SHA1=${DEP_SHA1_protoc_win64}) - FetchContent_Populate(protoc_binary) - elseif(CMAKE_HOST_SYSTEM_PROCESSOR STREQUAL "x86") - FetchContent_Declare(protoc_binary URL ${DEP_URL_protoc_win32} URL_HASH SHA1=${DEP_SHA1_protoc_win32}) - FetchContent_Populate(protoc_binary) - endif() - if(protoc_binary_SOURCE_DIR) - message("Use prebuilt protoc") - set(ONNX_CUSTOM_PROTOC_EXECUTABLE ${protoc_binary_SOURCE_DIR}/bin/protoc.exe) - set(PROTOC_EXECUTABLE ${ONNX_CUSTOM_PROTOC_EXECUTABLE}) - endif() - elseif(CMAKE_HOST_SYSTEM_NAME STREQUAL "Linux") - if(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "^(x86_64|amd64)$") - FetchContent_Declare(protoc_binary URL ${DEP_URL_protoc_linux_x64} URL_HASH SHA1=${DEP_SHA1_protoc_linux_x64}) - FetchContent_Populate(protoc_binary) - elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(i.86|x86?)$") - FetchContent_Declare(protoc_binary URL ${DEP_URL_protoc_linux_x86} URL_HASH SHA1=${DEP_SHA1_protoc_linux_x86}) - FetchContent_Populate(protoc_binary) - elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^aarch64.*") - FetchContent_Declare(protoc_binary URL ${DEP_URL_protoc_linux_aarch64} URL_HASH SHA1=${DEP_SHA1_protoc_linux_aarch64}) - FetchContent_Populate(protoc_binary) - endif() - if(protoc_binary_SOURCE_DIR) - message("Use prebuilt protoc") - set(ONNX_CUSTOM_PROTOC_EXECUTABLE ${protoc_binary_SOURCE_DIR}/bin/protoc) - set(PROTOC_EXECUTABLE ${ONNX_CUSTOM_PROTOC_EXECUTABLE}) - endif() - elseif ((CMAKE_SYSTEM_NAME STREQUAL "Emscripten" OR CMAKE_SYSTEM_NAME STREQUAL "Android" OR CMAKE_SYSTEM_NAME STREQUAL "iOS") AND CMAKE_HOST_SYSTEM_NAME STREQUAL "Darwin") + if (CMAKE_HOST_APPLE) + # Using CMAKE_CROSSCOMPILING is not recommended for Apple target devices. + # https://cmake.org/cmake/help/v3.26/variable/CMAKE_CROSSCOMPILING.html + # To keep it simple, just download and use the universal protoc binary for all Apple host builds. 
FetchContent_Declare(protoc_binary URL ${DEP_URL_protoc_mac_universal} URL_HASH SHA1=${DEP_SHA1_protoc_mac_universal}) FetchContent_Populate(protoc_binary) if(protoc_binary_SOURCE_DIR) @@ -150,6 +138,38 @@ if(CMAKE_CROSSCOMPILING AND NOT ONNX_CUSTOM_PROTOC_EXECUTABLE) set(ONNX_CUSTOM_PROTOC_EXECUTABLE ${protoc_binary_SOURCE_DIR}/bin/protoc) set(PROTOC_EXECUTABLE ${ONNX_CUSTOM_PROTOC_EXECUTABLE}) endif() + elseif (CMAKE_CROSSCOMPILING) + message("CMAKE_HOST_SYSTEM_NAME: ${CMAKE_HOST_SYSTEM_NAME}") + if(CMAKE_HOST_SYSTEM_NAME STREQUAL "Windows") + if(CMAKE_HOST_SYSTEM_PROCESSOR STREQUAL "AMD64") + FetchContent_Declare(protoc_binary URL ${DEP_URL_protoc_win64} URL_HASH SHA1=${DEP_SHA1_protoc_win64}) + FetchContent_Populate(protoc_binary) + elseif(CMAKE_HOST_SYSTEM_PROCESSOR STREQUAL "x86") + FetchContent_Declare(protoc_binary URL ${DEP_URL_protoc_win32} URL_HASH SHA1=${DEP_SHA1_protoc_win32}) + FetchContent_Populate(protoc_binary) + endif() + if(protoc_binary_SOURCE_DIR) + message("Use prebuilt protoc") + set(ONNX_CUSTOM_PROTOC_EXECUTABLE ${protoc_binary_SOURCE_DIR}/bin/protoc.exe) + set(PROTOC_EXECUTABLE ${ONNX_CUSTOM_PROTOC_EXECUTABLE}) + endif() + elseif(CMAKE_HOST_SYSTEM_NAME STREQUAL "Linux") + if(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "^(x86_64|amd64)$") + FetchContent_Declare(protoc_binary URL ${DEP_URL_protoc_linux_x64} URL_HASH SHA1=${DEP_SHA1_protoc_linux_x64}) + FetchContent_Populate(protoc_binary) + elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(i.86|x86?)$") + FetchContent_Declare(protoc_binary URL ${DEP_URL_protoc_linux_x86} URL_HASH SHA1=${DEP_SHA1_protoc_linux_x86}) + FetchContent_Populate(protoc_binary) + elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^aarch64.*") + FetchContent_Declare(protoc_binary URL ${DEP_URL_protoc_linux_aarch64} URL_HASH SHA1=${DEP_SHA1_protoc_linux_aarch64}) + FetchContent_Populate(protoc_binary) + endif() + if(protoc_binary_SOURCE_DIR) + message("Use prebuilt protoc") + set(ONNX_CUSTOM_PROTOC_EXECUTABLE ${protoc_binary_SOURCE_DIR}/bin/protoc) + set(PROTOC_EXECUTABLE ${ONNX_CUSTOM_PROTOC_EXECUTABLE}) + endif() + endif() endif() endif() @@ -184,9 +204,9 @@ FetchContent_Declare( ) set(protobuf_BUILD_TESTS OFF CACHE BOOL "Build protobuf tests" FORCE) -#TODO: we'd better to turn the following option off. However, it will cause +#TODO: we'd better to turn the following option off. However, it will cause # ".\build.bat --config Debug --parallel --skip_submodule_sync --update" fail with an error message: -# install(EXPORT "ONNXTargets" ...) includes target "onnx_proto" which requires target "libprotobuf-lite" that is +# install(EXPORT "ONNXTargets" ...) includes target "onnx_proto" which requires target "libprotobuf-lite" that is # not in any export set. 
#set(protobuf_INSTALL OFF CACHE BOOL "Install protobuf binaries and files" FORCE) set(protobuf_USE_EXTERNAL_GTEST ON CACHE BOOL "" FORCE) @@ -219,8 +239,6 @@ FetchContent_Declare( URL_HASH SHA1=${DEP_SHA1_mp11} ) -set(JSON_BuildTests OFF CACHE INTERNAL "") -set(JSON_Install OFF CACHE INTERNAL "") set(JSON_BuildTests OFF CACHE INTERNAL "") set(JSON_Install OFF CACHE INTERNAL "") @@ -253,14 +271,7 @@ if (onnxruntime_ENABLE_CPUINFO) set(CPUINFO_SUPPORTED TRUE) endif() if (WIN32) - # Exclude Windows ARM build and Windows Store - if (${onnxruntime_target_platform} MATCHES "^(ARM.*|arm.*)$" ) - message(WARNING "Cpuinfo not included for compilation problems with Windows ARM.") - set(CPUINFO_SUPPORTED FALSE) - elseif (WIN32 AND NOT CMAKE_CXX_STANDARD_LIBRARIES MATCHES kernel32.lib) - message(WARNING "Cpuinfo not included non-Desktop builds") - set(CPUINFO_SUPPORTED FALSE) - endif() + set(CPUINFO_SUPPORTED TRUE) elseif (NOT ${onnxruntime_target_platform} MATCHES "^(i[3-6]86|AMD64|x86(_64)?|armv[5-8].*|aarch64|arm64)$") message(WARNING "Target processor architecture \"${onnxruntime_target_platform}\" is not supported in cpuinfo. " @@ -304,13 +315,23 @@ if (CPUINFO_SUPPORTED) set(CPUINFO_BUILD_UNIT_TESTS OFF CACHE INTERNAL "") set(CPUINFO_BUILD_MOCK_TESTS OFF CACHE INTERNAL "") set(CPUINFO_BUILD_BENCHMARKS OFF CACHE INTERNAL "") - - FetchContent_Declare( - pytorch_cpuinfo - URL ${DEP_URL_pytorch_cpuinfo} - URL_HASH SHA1=${DEP_SHA1_pytorch_cpuinfo} - FIND_PACKAGE_ARGS NAMES cpuinfo - ) + if(onnxruntime_target_platform STREQUAL "ARM64EC") + message("Applying a patch for Windows ARM64EC in cpuinfo") + FetchContent_Declare( + pytorch_cpuinfo + URL ${DEP_URL_pytorch_cpuinfo} + URL_HASH SHA1=${DEP_SHA1_pytorch_cpuinfo} + PATCH_COMMAND ${Patch_EXECUTABLE} -p1 < ${PROJECT_SOURCE_DIR}/patches/cpuinfo/9bb12d342fd9479679d505d93a478a6f9cd50a47.patch + FIND_PACKAGE_ARGS NAMES cpuinfo + ) + else() + FetchContent_Declare( + pytorch_cpuinfo + URL ${DEP_URL_pytorch_cpuinfo} + URL_HASH SHA1=${DEP_SHA1_pytorch_cpuinfo} + FIND_PACKAGE_ARGS NAMES cpuinfo + ) + endif() set(ONNXRUNTIME_CPUINFO_PROJ pytorch_cpuinfo) endif() @@ -536,22 +557,32 @@ if(onnxruntime_ENABLE_TRAINING OR (onnxruntime_ENABLE_TRAINING_APIS AND onnxrunt onnxruntime_fetchcontent_makeavailable(cxxopts) endif() +if (onnxruntime_USE_COREML) + FetchContent_Declare( + coremltools + URL ${DEP_URL_coremltools} + URL_HASH SHA1=${DEP_SHA1_coremltools} + PATCH_COMMAND ${Patch_EXECUTABLE} --binary --ignore-whitespace -p1 < ${PROJECT_SOURCE_DIR}/patches/coremltools/crossplatformbuild.patch + ) + # we don't build directly so use Populate. 
selected files are built from onnxruntime_providers_coreml.cmake
+  FetchContent_Populate(coremltools)
+endif()
+
 message("Finished fetching external dependencies")
 set(onnxruntime_LINK_DIRS )
 if (onnxruntime_USE_CUDA)
   #TODO: combine onnxruntime_CUDNN_HOME and onnxruntime_CUDA_HOME, assume they are the same
+  find_package(CUDAToolkit REQUIRED)
   if (WIN32)
     if(onnxruntime_CUDNN_HOME)
       list(APPEND onnxruntime_LINK_DIRS ${onnxruntime_CUDNN_HOME}/lib ${onnxruntime_CUDNN_HOME}/lib/x64)
     endif()
-    list(APPEND onnxruntime_LINK_DIRS ${onnxruntime_CUDA_HOME}/x64/lib64)
   else()
     if(onnxruntime_CUDNN_HOME)
       list(APPEND onnxruntime_LINK_DIRS ${onnxruntime_CUDNN_HOME}/lib ${onnxruntime_CUDNN_HOME}/lib64)
     endif()
-    list(APPEND onnxruntime_LINK_DIRS ${onnxruntime_CUDA_HOME}/lib64)
   endif()
 endif()
@@ -562,4 +593,3 @@ endif()
 FILE(TO_NATIVE_PATH ${CMAKE_BINARY_DIR} ORT_BINARY_DIR)
 FILE(TO_NATIVE_PATH ${PROJECT_SOURCE_DIR} ORT_SOURCE_DIR)
-
diff --git a/cmake/external/xnnpack.cmake b/cmake/external/xnnpack.cmake
index e661aa51bfc1..41f02ce6f22b 100644
--- a/cmake/external/xnnpack.cmake
+++ b/cmake/external/xnnpack.cmake
@@ -6,10 +6,14 @@ set(FP16_BUILD_BENCHMARKS OFF CACHE INTERNAL "")
 set(PTHREADPOOL_BUILD_TESTS OFF CACHE INTERNAL "")
 set(PTHREADPOOL_BUILD_BENCHMARKS OFF CACHE INTERNAL "")
+if(CMAKE_SYSTEM_PROCESSOR MATCHES "^riscv64.*")
+  set(XNNPACK_USE_SYSTEM_LIBS OFF)
+endif()
+
 # BF16 instructions cause ICE in Android NDK compiler
 if(CMAKE_ANDROID_ARCH_ABI STREQUAL armeabi-v7a)
   set(XNNPACK_ENABLE_ARM_BF16 OFF)
-ENDIF()
+endif()
 # fp16 depends on psimd
 FetchContent_Declare(psimd URL ${DEP_URL_psimd} URL_HASH SHA1=${DEP_SHA1_psimd})
diff --git a/cmake/maccatalyst_prepare_objects_for_prelink.py b/cmake/maccatalyst_prepare_objects_for_prelink.py
new file mode 100644
index 000000000000..34664b4e0523
--- /dev/null
+++ b/cmake/maccatalyst_prepare_objects_for_prelink.py
@@ -0,0 +1,72 @@
+#!/usr/bin/env python3
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+
+import os
+import shutil
+import sys
+
+
+# Note: This script is mainly used for sanity checking/validating that the files in the .a library are equal to the
+# .o files in the source dir, to handle the case of source files having duplicate names under different
+# subdirectories for each onnxruntime library. (Only applicable when doing a Mac Catalyst build.)
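+# Implementation note: os.walk() yields each subdirectory's full path, so the same
+# basename (e.g. utils.o) can appear under more than one subdirectory; main() below
+# renames later duplicates to <name>_1.o, <name>_2.o, ... before copying so that no
+# object file is silently overwritten.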
+def main():
+    source_dir = sys.argv[1]
+    dest_dir = sys.argv[2]
+    files_from_static_lib = sys.argv[3]
+    files_from_source_dir = []
+    for subdir, _, files in os.walk(source_dir):
+        for file_name in files:
+            if file_name.endswith(".o"):
+                files_from_source_dir.append(file_name.strip())
+                dest_name_without_extension, _ = os.path.splitext(file_name)
+                counter = 0
+
+                dest_file = f"{dest_name_without_extension}.o"
+                while os.path.exists(os.path.join(dest_dir, dest_file)):
+                    print("Duplicate file name from source: " + os.path.join(subdir, file_name))
+                    counter += 1
+                    dest_file = f"{dest_name_without_extension}_{counter}.o"
+                    print("Renamed file name in destination: " + os.path.join(dest_dir, dest_file))
+
+                destination_path = os.path.join(dest_dir, dest_file)
+                source_file = os.path.join(subdir, file_name)
+                shutil.copy(source_file, destination_path)
+
+    # Sanity check to ensure the number of .o object files from the original cmake source directory matches the
+    # number of .o files extracted from each .a onnxruntime library
+    file_lists_from_static_lib = []
+    with open(files_from_static_lib) as file:
+        filenames = file.readlines()
+    for filename in filenames:
+        file_lists_from_static_lib.append(filename.strip())
+
+    sorted_list1 = sorted(file_lists_from_static_lib)
+    sorted_list2 = sorted(files_from_source_dir)
+
+    if len(sorted_list1) != len(sorted_list2):
+        print(
+            "Caught a mismatch in the number of .o object files from the original cmake source directory: ",
+            len(sorted_list1),
+            "the number of .o files extracted from the static onnxruntime lib: ",
+            len(sorted_list2),
+            "for: ",
+            os.path.basename(source_dir),
+        )
+
+    if sorted_list1 == sorted_list2:
+        print(
+            "Sanity check passed: object files from the original source directory match the files extracted "
+            "from the static library for: ",
+            os.path.basename(source_dir),
+        )
+    else:
+        print(
+            "Error: Mismatch between object files from the original source directory "
+            "and the .o files extracted from the static library for: ",
+            os.path.basename(source_dir),
+        )
+
+
+if __name__ == "__main__":
+    main()
diff --git a/cmake/onnxruntime.cmake b/cmake/onnxruntime.cmake
index c900f4d4b09a..e15c8a046dc2 100644
--- a/cmake/onnxruntime.cmake
+++ b/cmake/onnxruntime.cmake
@@ -189,7 +189,6 @@ set(onnxruntime_INTERNAL_LIBRARIES
   ${PROVIDERS_SNPE}
   ${PROVIDERS_TVM}
   ${PROVIDERS_RKNPU}
-  ${PROVIDERS_VITISAI}
   ${PROVIDERS_XNNPACK}
   ${PROVIDERS_WEBNN}
   ${PROVIDERS_AZURE}
@@ -282,7 +281,13 @@ endif()
 # Assemble the Apple static framework (iOS and macOS)
 if(onnxruntime_BUILD_APPLE_FRAMEWORK)
-  set(STATIC_FRAMEWORK_OUTPUT_DIR ${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}-${CMAKE_OSX_SYSROOT})
+  # When building for Mac Catalyst, CMAKE_OSX_SYSROOT is set to MacOSX as well; to avoid duplication,
+  # we use `-macabi` in the name of the output static apple framework directory.
+  if (PLATFORM_NAME STREQUAL "macabi")
+    set(STATIC_FRAMEWORK_OUTPUT_DIR ${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}-macabi)
+  else()
+    set(STATIC_FRAMEWORK_OUTPUT_DIR ${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}-${CMAKE_OSX_SYSROOT})
+  endif()
   # Setup the various directories required. Remove any existing ones so we start with a clean directory.
   set(STATIC_LIB_DIR ${CMAKE_CURRENT_BINARY_DIR}/static_libraries)
@@ -300,18 +305,34 @@ if(onnxruntime_BUILD_APPLE_FRAMEWORK)
   # to enforce symbol visibility. doing it this way limits the symbols included from the .a files to symbols used
   # by the ORT .o files.
-    # If it's an onnxruntime library, extract .o files to a separate directory for each library to avoid any clashes
-    # with filenames (e.g. utils.o)
+    # If it's an onnxruntime library, extract .o files from the original cmake build path to a separate directory for
+    # each library to avoid any clashes with filenames (e.g. utils.o)
     foreach(_LIB ${onnxruntime_INTERNAL_LIBRARIES} )
       GET_TARGET_PROPERTY(_LIB_TYPE ${_LIB} TYPE)
       if(_LIB_TYPE STREQUAL "STATIC_LIBRARY")
         set(CUR_STATIC_LIB_OBJ_DIR ${STATIC_LIB_TEMP_DIR}/$<TARGET_LINKER_FILE_BASE_NAME:${_LIB}>)
         add_custom_command(TARGET onnxruntime POST_BUILD
                            COMMAND ${CMAKE_COMMAND} -E make_directory ${CUR_STATIC_LIB_OBJ_DIR})
-
-        add_custom_command(TARGET onnxruntime POST_BUILD
-                           COMMAND ar ARGS -x $<TARGET_FILE:${_LIB}>
-                           WORKING_DIRECTORY ${CUR_STATIC_LIB_OBJ_DIR})
+        if (PLATFORM_NAME STREQUAL "macabi")
+          # There exist several duplicate source file names under different subdirectories within
+          # each onnxruntime library (e.g. onnxruntime/contrib_ops/cpu/element_wise_ops.o
+          # vs. onnxruntime/providers/core/cpu/math/element_wise_ops.o).
+          # In that case, using 'ar ARGS -x' to extract the .o files from the .a lib can cause identically named
+          # files to be overwritten and lead to missing-symbol errors in the generated binary.
+          # So we use the python script below as a sanity check: it recursively finds all .o files in
+          # ${CUR_TARGET_CMAKE_SOURCE_LIB_DIR}, verifies that they match the contents of the .a, and then copies
+          # them from the source dir.
+          # TODO: The copying action here isn't really necessary. As a future fix, consider having the script
+          # extract from the ar archive with the rename, to potentially make both maccatalyst and other builds
+          # do the same thing.
+          set(CUR_TARGET_CMAKE_SOURCE_LIB_DIR ${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/${_LIB}.dir)
+          add_custom_command(TARGET onnxruntime POST_BUILD
+                             COMMAND ar -t $<TARGET_FILE:${_LIB}> | grep "\.o$" > ${_LIB}.object_file_list.txt
+                             COMMAND ${CMAKE_COMMAND} -E env python3 ${CMAKE_CURRENT_SOURCE_DIR}/maccatalyst_prepare_objects_for_prelink.py ${CUR_TARGET_CMAKE_SOURCE_LIB_DIR} ${CUR_STATIC_LIB_OBJ_DIR} ${CUR_STATIC_LIB_OBJ_DIR}/${_LIB}.object_file_list.txt
+                             WORKING_DIRECTORY ${CUR_STATIC_LIB_OBJ_DIR})
+        else()
+          add_custom_command(TARGET onnxruntime POST_BUILD
+                             COMMAND ar ARGS -x $<TARGET_FILE:${_LIB}>
+                             WORKING_DIRECTORY ${CUR_STATIC_LIB_OBJ_DIR})
+        endif()
       endif()
     endforeach()
diff --git a/cmake/onnxruntime_common.cmake b/cmake/onnxruntime_common.cmake
index 43d5fa9bdee3..69d8f5fa138c 100644
--- a/cmake/onnxruntime_common.cmake
+++ b/cmake/onnxruntime_common.cmake
@@ -129,7 +129,7 @@ target_include_directories(onnxruntime_common
   ${OPTIONAL_LITE_INCLUDE_DIR})
-target_link_libraries(onnxruntime_common PUBLIC safeint_interface ${GSL_TARGET} ${ABSEIL_LIBS})
+target_link_libraries(onnxruntime_common PUBLIC safeint_interface ${GSL_TARGET} ${ABSEIL_LIBS} date::date)
 add_dependencies(onnxruntime_common ${onnxruntime_EXTERNAL_DEPENDENCIES})
@@ -189,6 +189,8 @@ elseif(NOT CMAKE_SYSTEM_NAME STREQUAL "Emscripten")
     set(ARM TRUE)
   elseif(dumpmachine_output MATCHES "^aarch64.*")
     set(ARM64 TRUE)
+  elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^riscv64.*")
+    set(RISCV64 TRUE)
   elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(i.86|x86?)$")
     set(X86 TRUE)
   elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|amd64)$")
@@ -198,11 +200,7 @@
 endif()
-if (ARM64 OR ARM OR X86 OR X64 OR X86_64)
-  if((WIN32 AND NOT CMAKE_CXX_STANDARD_LIBRARIES MATCHES kernel32.lib) OR ((ARM64 OR ARM) AND MSVC))
-    # msvc compiler report syntax error with cpuinfo arm source files
-    # and cpuinfo
does not have code for getting arm uarch info under windows - else() +if (RISCV64 OR ARM64 OR ARM OR X86 OR X64 OR X86_64) # Link cpuinfo if supported # Using it mainly in ARM with Android. # Its functionality in detecting x86 cpu features are lacking, so is support for Windows. @@ -210,7 +208,6 @@ if (ARM64 OR ARM OR X86 OR X64 OR X86_64) onnxruntime_add_include_to_target(onnxruntime_common cpuinfo::cpuinfo) list(APPEND onnxruntime_EXTERNAL_LIBRARIES cpuinfo::cpuinfo ${ONNXRUNTIME_CLOG_TARGET_NAME}) endif() - endif() endif() if (NOT onnxruntime_BUILD_SHARED_LIB) diff --git a/cmake/onnxruntime_graph.cmake b/cmake/onnxruntime_graph.cmake index 3f532ec2c326..4d51325b8414 100644 --- a/cmake/onnxruntime_graph.cmake +++ b/cmake/onnxruntime_graph.cmake @@ -7,8 +7,26 @@ file(GLOB_RECURSE onnxruntime_graph_src CONFIGURE_DEPENDS "${ONNXRUNTIME_ROOT}/core/graph/*.cc" ) -# create empty list for any excludes +# start with empty training srcs list +set(orttraining_graph_src) + +if (onnxruntime_ENABLE_TRAINING_OPS AND NOT onnxruntime_ENABLE_TRAINING) + set(orttraining_graph_src + "${ORTTRAINING_SOURCE_DIR}/core/graph/training_op_defs.cc" + "${ORTTRAINING_SOURCE_DIR}/core/graph/training_op_defs.h" + ) +endif() + +if (onnxruntime_ENABLE_TRAINING) + file(GLOB_RECURSE orttraining_graph_src CONFIGURE_DEPENDS + "${ORTTRAINING_SOURCE_DIR}/core/graph/*.h" + "${ORTTRAINING_SOURCE_DIR}/core/graph/*.cc" + ) +endif() + +# create empty lists for any excludes set(onnxruntime_graph_src_exclude_patterns) +set(orttraining_graph_src_exclude_patterns) if (onnxruntime_MINIMAL_BUILD) # remove schema registration support @@ -22,11 +40,18 @@ if (onnxruntime_MINIMAL_BUILD) "${ONNXRUNTIME_ROOT}/core/graph/contrib_ops/onnx_function_util.cc" "${ONNXRUNTIME_ROOT}/core/graph/contrib_ops/shape_inference_functions.h" "${ONNXRUNTIME_ROOT}/core/graph/contrib_ops/shape_inference_functions.cc" + "${ONNXRUNTIME_ROOT}/core/graph/dml_ops/dml_defs.h" + "${ONNXRUNTIME_ROOT}/core/graph/dml_ops/dml_defs.cc" "${ONNXRUNTIME_ROOT}/core/graph/function_template.h" "${ONNXRUNTIME_ROOT}/core/graph/function_utils.h" "${ONNXRUNTIME_ROOT}/core/graph/function_utils.cc" ) + list(APPEND orttraining_graph_src_exclude_patterns + "${ORTTRAINING_SOURCE_DIR}/core/graph/training_op_defs.h" + "${ORTTRAINING_SOURCE_DIR}/core/graph/training_op_defs.cc" + ) + # no Function support initially list(APPEND onnxruntime_graph_src_exclude_patterns "${ONNXRUNTIME_ROOT}/core/graph/function*" @@ -64,30 +89,12 @@ endif() file(GLOB onnxruntime_graph_src_exclude ${onnxruntime_graph_src_exclude_patterns}) list(REMOVE_ITEM onnxruntime_graph_src ${onnxruntime_graph_src_exclude}) -file(GLOB_RECURSE onnxruntime_ir_defs_src CONFIGURE_DEPENDS - "${ONNXRUNTIME_ROOT}/core/defs/*.cc" -) - -if (onnxruntime_ENABLE_TRAINING_OPS AND NOT onnxruntime_ENABLE_TRAINING) - set(orttraining_graph_src - "${ORTTRAINING_SOURCE_DIR}/core/graph/training_op_defs.cc" - "${ORTTRAINING_SOURCE_DIR}/core/graph/training_op_defs.h" - ) -endif() - -if (onnxruntime_ENABLE_TRAINING) - file(GLOB_RECURSE orttraining_graph_src CONFIGURE_DEPENDS - "${ORTTRAINING_SOURCE_DIR}/core/graph/*.h" - "${ORTTRAINING_SOURCE_DIR}/core/graph/*.cc" - ) -endif() - -set(onnxruntime_graph_lib_src ${onnxruntime_graph_src} ${onnxruntime_ir_defs_src}) if (onnxruntime_ENABLE_TRAINING_OPS) - list(APPEND onnxruntime_graph_lib_src ${orttraining_graph_src}) + file(GLOB orttraining_graph_src_exclude ${orttraining_graph_src_exclude_patterns}) + list(REMOVE_ITEM orttraining_graph_src ${orttraining_graph_src_exclude}) endif() 
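The hunks above rework onnxruntime_graph.cmake around a single glob-then-exclude idiom: gather sources with file(GLOB_RECURSE), expand the exclude patterns with file(GLOB), and subtract them before the library is created. A minimal self-contained sketch of that pattern, with illustrative names (EXAMPLE_ROOT, example_lib):

    # Gather sources recursively, expand the exclude patterns, then subtract.
    file(GLOB_RECURSE example_srcs CONFIGURE_DEPENDS "${EXAMPLE_ROOT}/*.cc")
    set(example_src_exclude_patterns "${EXAMPLE_ROOT}/legacy/*.cc")
    file(GLOB example_src_exclude ${example_src_exclude_patterns})
    if(example_src_exclude)  # REMOVE_ITEM needs at least one item to remove
      list(REMOVE_ITEM example_srcs ${example_src_exclude})
    endif()
    add_library(example_lib STATIC ${example_srcs})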
-onnxruntime_add_static_library(onnxruntime_graph ${onnxruntime_graph_lib_src}) +onnxruntime_add_static_library(onnxruntime_graph ${onnxruntime_graph_src} ${orttraining_graph_src}) add_dependencies(onnxruntime_graph onnx_proto flatbuffers::flatbuffers) onnxruntime_add_include_to_target(onnxruntime_graph onnxruntime_common ${WIL_TARGET} onnx onnx_proto ${PROTOBUF_LIB} flatbuffers::flatbuffers safeint_interface Boost::mp11) @@ -120,7 +127,7 @@ endif() set_target_properties(onnxruntime_graph PROPERTIES FOLDER "ONNXRuntime") set_target_properties(onnxruntime_graph PROPERTIES LINKER_LANGUAGE CXX) -source_group(TREE ${REPO_ROOT} FILES ${onnxruntime_graph_src} ${onnxruntime_ir_defs_src}) +source_group(TREE ${REPO_ROOT} FILES ${onnxruntime_graph_src}) if (onnxruntime_ENABLE_TRAINING_OPS) source_group(TREE ${ORTTRAINING_ROOT} FILES ${orttraining_graph_src}) endif() diff --git a/cmake/onnxruntime_mlas.cmake b/cmake/onnxruntime_mlas.cmake index bee83ff07c74..f7103c3b00a3 100644 --- a/cmake/onnxruntime_mlas.cmake +++ b/cmake/onnxruntime_mlas.cmake @@ -1,7 +1,9 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. -set(MLAS_SRC_DIR ${ONNXRUNTIME_ROOT}/core/mlas/lib) +set(MLAS_ROOT ${ONNXRUNTIME_ROOT}/core/mlas) +set(MLAS_SRC_DIR ${MLAS_ROOT}/lib) +set(MLAS_INC_DIR ${MLAS_ROOT}/inc) # # All hardware agnostic source files here @@ -9,6 +11,7 @@ set(MLAS_SRC_DIR ${ONNXRUNTIME_ROOT}/core/mlas/lib) # multi-target build # onnxruntime_add_static_library(onnxruntime_mlas + ${MLAS_SRC_DIR}/mlasi.h ${MLAS_SRC_DIR}/platform.cpp ${MLAS_SRC_DIR}/threading.cpp ${MLAS_SRC_DIR}/sgemm.cpp @@ -33,9 +36,18 @@ onnxruntime_add_static_library(onnxruntime_mlas ${MLAS_SRC_DIR}/qpostprocessor.cpp ${MLAS_SRC_DIR}/qlgavgpool.cpp ${MLAS_SRC_DIR}/qdwconv_kernelsize.cpp + ${MLAS_SRC_DIR}/sqnbitgemm.h ${MLAS_SRC_DIR}/sqnbitgemm.cpp ) +target_sources(onnxruntime_mlas PRIVATE + ${MLAS_INC_DIR}/mlas_float16.h + ${MLAS_INC_DIR}/mlas_gemm_postprocessor.h + ${MLAS_INC_DIR}/mlas_q4.h + ${MLAS_INC_DIR}/mlas_qnbit.h + ${MLAS_INC_DIR}/mlas.h +) + if (NOT onnxruntime_ORT_MINIMAL_BUILD) target_sources(onnxruntime_mlas PRIVATE ${MLAS_SRC_DIR}/q4_dq.cpp @@ -45,15 +57,6 @@ endif() set(ONNXRUNTIME_MLAS_LIBS onnxruntime_mlas) -function(add_jblas) - add_subdirectory(${MLAS_SRC_DIR}/x86_64/jblas jblas) - target_link_libraries(onnxruntime_mlas PRIVATE jblas::jblas) - target_sources(onnxruntime_mlas PRIVATE - ${MLAS_SRC_DIR}/jblas_gemm.cpp - ) - set_target_properties(${target_name} PROPERTIES COMPILE_WARNING_AS_ERROR OFF) -endfunction() - #TODO: set MASM flags properly function(setup_mlas_source_for_windows) @@ -143,10 +146,6 @@ function(setup_mlas_source_for_windows) target_sources(onnxruntime_mlas PRIVATE ${MLAS_SRC_DIR}/arm/sgemmc.cpp ) - # it should be removed after Visual Stuio is upgraded to 17.7 - if (MSVC) - add_compile_options("-d2SSAOptimizer-") - endif() elseif(onnxruntime_target_platform STREQUAL "x64") file(GLOB_RECURSE mlas_platform_srcs_avx CONFIGURE_DEPENDS @@ -198,6 +197,7 @@ function(setup_mlas_source_for_windows) ${MLAS_SRC_DIR}/amd64/sgemma.asm ${MLAS_SRC_DIR}/amd64/cvtfp16a.asm ${MLAS_SRC_DIR}/amd64/SoftmaxKernelAvx.asm + ${MLAS_SRC_DIR}/amd64/SoftmaxKernelAvx512F.asm ${MLAS_SRC_DIR}/amd64/TransKernelFma3.asm ${MLAS_SRC_DIR}/amd64/TransKernelAvx512F.asm ${MLAS_SRC_DIR}/amd64/LogisticKernelFma3.asm @@ -300,8 +300,8 @@ else() if(APPLE) get_target_property(ONNXRUNTIME_MLAS_MACOSX_ARCH onnxruntime_mlas OSX_ARCHITECTURES) endif() - list(LENGTH ONNXRUNTIME_MLAS_MACOSX_ARCH 
ONNXRUNTIME_MLAS_MACOSX_ARCH_LENGH)
-  if(ONNXRUNTIME_MLAS_MACOSX_ARCH_LENGH GREATER 1)
+  list(LENGTH ONNXRUNTIME_MLAS_MACOSX_ARCH ONNXRUNTIME_MLAS_MACOSX_ARCH_LENGTH)
+  if(ONNXRUNTIME_MLAS_MACOSX_ARCH_LENGTH GREATER 1)
     set(ONNXRUNTIME_MLAS_MULTI_ARCH TRUE)
   endif()
   #If ONNXRUNTIME_MLAS_MULTI_ARCH is true, we need to go through every if branch below
@@ -348,25 +348,31 @@ else()
         ${MLAS_SRC_DIR}/qgemm_kernel_sdot.cpp
         ${MLAS_SRC_DIR}/sqnbitgemm_kernel_neon.cpp
       )
+      set_source_files_properties(${MLAS_SRC_DIR}/sqnbitgemm_kernel_neon.cpp
+                                  PROPERTIES COMPILE_FLAGS " -march=armv8.2-a+dotprod")
       if (NOT APPLE)
         set(mlas_platform_srcs
           ${mlas_platform_srcs}
           ${MLAS_SRC_DIR}/aarch64/HalfGemmKernelNeon.S
           ${MLAS_SRC_DIR}/aarch64/QgemmS8S8KernelSmmla.S
           ${MLAS_SRC_DIR}/aarch64/QgemmU8X8KernelUmmla.S
+          ${MLAS_SRC_DIR}/aarch64/SbgemmKernelNeon.S
           ${MLAS_SRC_DIR}/activate_fp16.cpp
           ${MLAS_SRC_DIR}/dwconv.cpp
           ${MLAS_SRC_DIR}/halfgemm_kernel_neon.cpp
           ${MLAS_SRC_DIR}/pooling_fp16.cpp
           ${MLAS_SRC_DIR}/qgemm_kernel_smmla.cpp
           ${MLAS_SRC_DIR}/qgemm_kernel_ummla.cpp
+          ${MLAS_SRC_DIR}/sbgemm_kernel_neon.cpp
         )
         set_source_files_properties(${MLAS_SRC_DIR}/aarch64/HalfGemmKernelNeon.S PROPERTIES COMPILE_FLAGS " -march=armv8.2-a+fp16 ")
         set_source_files_properties(${MLAS_SRC_DIR}/aarch64/QgemmS8S8KernelSmmla.S PROPERTIES COMPILE_FLAGS " -march=armv8.2-a+i8mm ")
         set_source_files_properties(${MLAS_SRC_DIR}/aarch64/QgemmU8X8KernelUmmla.S PROPERTIES COMPILE_FLAGS " -march=armv8.2-a+i8mm ")
+        set_source_files_properties(${MLAS_SRC_DIR}/aarch64/SbgemmKernelNeon.S PROPERTIES COMPILE_FLAGS " -march=armv8.2-a+bf16 ")
         set_source_files_properties(${MLAS_SRC_DIR}/activate_fp16.cpp PROPERTIES COMPILE_FLAGS " -march=armv8.2-a+fp16 ")
         set_source_files_properties(${MLAS_SRC_DIR}/dwconv.cpp PROPERTIES COMPILE_FLAGS " -march=armv8.2-a+fp16 ")
         set_source_files_properties(${MLAS_SRC_DIR}/pooling_fp16.cpp PROPERTIES COMPILE_FLAGS " -march=armv8.2-a+fp16 ")
+        set_source_files_properties(${MLAS_SRC_DIR}/sbgemm_kernel_neon.cpp PROPERTIES COMPILE_FLAGS " -march=armv8.2-a+bf16 ")
       endif()
       if(ONNXRUNTIME_MLAS_MULTI_ARCH)
@@ -531,6 +537,7 @@ else()
         ${MLAS_SRC_DIR}/x86_64/DgemmKernelAvx512F.S
         ${MLAS_SRC_DIR}/x86_64/SgemmKernelAvx512F.S
         ${MLAS_SRC_DIR}/x86_64/SconvKernelAvx512F.S
+        ${MLAS_SRC_DIR}/x86_64/SoftmaxKernelAvx512F.S
         ${MLAS_SRC_DIR}/x86_64/SpoolKernelAvx512F.S
         ${MLAS_SRC_DIR}/x86_64/TransKernelAvx512F.S
         ${MLAS_SRC_DIR}/intrinsics/avx512/quantize_avx512f.cpp
@@ -612,15 +619,13 @@ else()
     target_sources(onnxruntime_mlas PRIVATE ${mlas_platform_srcs})
 endif()
-if(USE_JBLAS)
-  add_jblas()
-endif()
-
 foreach(mlas_target ${ONNXRUNTIME_MLAS_LIBS})
-  target_include_directories(${mlas_target} PRIVATE ${ONNXRUNTIME_ROOT}/core/mlas/inc ${MLAS_SRC_DIR})
+  target_include_directories(${mlas_target} PRIVATE ${MLAS_INC_DIR} ${MLAS_SRC_DIR})
   onnxruntime_add_include_to_target(${mlas_target} ${GSL_TARGET})
+
+  set_target_properties(${mlas_target} PROPERTIES FOLDER "ONNXRuntime")
 endforeach()
-set_target_properties(onnxruntime_mlas PROPERTIES FOLDER "ONNXRuntime")
+
 if (WIN32)
   target_compile_options(onnxruntime_mlas PRIVATE "$<$<COMPILE_LANGUAGE:CXX>:/wd6385>" "$<$<COMPILE_LANGUAGE:CXX>:/wd4127>")
   if (onnxruntime_ENABLE_STATIC_ANALYSIS)
@@ -628,6 +633,12 @@ if (WIN32)
   endif()
 endif()
+if (PLATFORM_NAME STREQUAL "macabi")
+  # Needed for maccatalyst C compilation
+  # i.e.
the flags below add "--target=x86_64-apple-ios14.0-macabi -ffunction-sections -fdata-sections" + target_compile_options(onnxruntime_mlas PRIVATE ${CMAKE_C_FLAGS}) +endif() + if (NOT onnxruntime_BUILD_SHARED_LIB) install(TARGETS onnxruntime_mlas ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} @@ -636,6 +647,21 @@ if (NOT onnxruntime_BUILD_SHARED_LIB) FRAMEWORK DESTINATION ${CMAKE_INSTALL_BINDIR}) endif() +# set up source group for MLAS source files +block() + set(source_group_srcs) + foreach(mlas_target ${ONNXRUNTIME_MLAS_LIBS}) + get_target_property(mlas_target_srcs ${mlas_target} SOURCES) + foreach(mlas_target_src ${mlas_target_srcs}) + cmake_path(IS_PREFIX MLAS_ROOT ${mlas_target_src} in_mlas_root) + if(in_mlas_root) + list(APPEND source_group_srcs ${mlas_target_src}) + endif() + endforeach() + endforeach() + source_group(TREE ${MLAS_ROOT} FILES ${source_group_srcs}) +endblock() + if (NOT onnxruntime_ORT_MINIMAL_BUILD) @@ -647,7 +673,7 @@ if (NOT onnxruntime_ORT_MINIMAL_BUILD) onnxruntime_add_executable(onnxruntime_mlas_q4dq ${MLAS_SRC_DIR}/q4_dq_cli.cpp ) - target_include_directories(onnxruntime_mlas_q4dq PRIVATE ${ONNXRUNTIME_ROOT}/core/mlas/inc ${MLAS_SRC_DIR}) + target_include_directories(onnxruntime_mlas_q4dq PRIVATE ${MLAS_INC_DIR} ${MLAS_SRC_DIR}) set_target_properties(onnxruntime_mlas_q4dq PROPERTIES FOLDER "ONNXRuntimeTest") target_link_libraries(onnxruntime_mlas_q4dq PRIVATE ${ONNXRUNTIME_MLAS_LIBS} onnxruntime_common) diff --git a/cmake/onnxruntime_nodejs.cmake b/cmake/onnxruntime_nodejs.cmake index 6053b9d1088c..555baac6f1a5 100644 --- a/cmake/onnxruntime_nodejs.cmake +++ b/cmake/onnxruntime_nodejs.cmake @@ -88,7 +88,7 @@ add_custom_target(js_common_npm_ci ALL add_custom_target(nodejs_binding_wrapper ALL COMMAND ${NPM_CLI} ci - COMMAND ${NPM_CLI} run build -- --onnxruntime-build-dir=${CMAKE_CURRENT_BINARY_DIR} --config=${CMAKE_BUILD_TYPE} + COMMAND ${NPM_CLI} run build -- --onnxruntime-build-dir=${CMAKE_CURRENT_BINARY_DIR} --config=${CMAKE_BUILD_TYPE} --onnxruntime-generator=${CMAKE_GENERATOR} --arch=${NODEJS_BINDING_ARCH} ${NODEJS_BINDING_USE_CUDA} ${NODEJS_BINDING_USE_DML} ${NODEJS_BINDING_USE_TENSORRT} ${NODEJS_BINDING_USE_COREML} WORKING_DIRECTORY ${JS_NODE_ROOT} diff --git a/cmake/onnxruntime_optimizer.cmake b/cmake/onnxruntime_optimizer.cmake index 6f09583199ff..f15d5b8dd6f8 100644 --- a/cmake/onnxruntime_optimizer.cmake +++ b/cmake/onnxruntime_optimizer.cmake @@ -130,3 +130,7 @@ if (NOT onnxruntime_BUILD_SHARED_LIB) RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} FRAMEWORK DESTINATION ${CMAKE_INSTALL_BINDIR}) endif() + +if (onnxruntime_USE_ROCM) + add_dependencies(onnxruntime_optimizer generate_hipified_files) +endif() diff --git a/cmake/onnxruntime_providers.cmake b/cmake/onnxruntime_providers.cmake index 2cf0a6b2b9bd..6c5369ca3be3 100644 --- a/cmake/onnxruntime_providers.cmake +++ b/cmake/onnxruntime_providers.cmake @@ -67,7 +67,7 @@ if(onnxruntime_USE_CUDA) endif() if(onnxruntime_USE_COREML) if (CMAKE_SYSTEM_NAME STREQUAL "Darwin" OR CMAKE_SYSTEM_NAME STREQUAL "iOS" OR CMAKE_SYSTEM_NAME STREQUAL "visionOS") - set(PROVIDERS_COREML onnxruntime_providers_coreml onnxruntime_coreml_proto) + set(PROVIDERS_COREML onnxruntime_providers_coreml coreml_proto) else() set(PROVIDERS_COREML onnxruntime_providers_coreml) endif() diff --git a/cmake/onnxruntime_providers_coreml.cmake b/cmake/onnxruntime_providers_coreml.cmake index 7c712fc40064..b8ebc4ca5323 100644 --- a/cmake/onnxruntime_providers_coreml.cmake +++ b/cmake/onnxruntime_providers_coreml.cmake @@ -1,107 +1,220 @@ # Copyright 
(c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. - if (onnxruntime_MINIMAL_BUILD AND NOT onnxruntime_EXTENDED_MINIMAL_BUILD) - message(FATAL_ERROR "CoreML EP can not be used in a basic minimal build. Please build with '--minimal_build extended'") - endif() +if (onnxruntime_MINIMAL_BUILD AND NOT onnxruntime_EXTENDED_MINIMAL_BUILD) + message(FATAL_ERROR "CoreML EP cannot be used in a basic minimal build. Please build with '--minimal_build extended'") +endif() + +add_compile_definitions(USE_COREML=1) - add_compile_definitions(USE_COREML=1) - - # Compile CoreML proto definition to ${CMAKE_CURRENT_BINARY_DIR}/coreml - if (CMAKE_SYSTEM_NAME STREQUAL "Darwin" OR CMAKE_SYSTEM_NAME STREQUAL "iOS" OR CMAKE_SYSTEM_NAME STREQUAL "visionOS") - set(COREML_PROTO_ROOT ${PROJECT_SOURCE_DIR}/../onnxruntime/core/providers/coreml/mlmodel_format) - file(GLOB coreml_proto_srcs - "${COREML_PROTO_ROOT}/*.proto" - ) - onnxruntime_add_static_library(onnxruntime_coreml_proto ${coreml_proto_srcs}) - target_include_directories(onnxruntime_coreml_proto PUBLIC $ "${CMAKE_CURRENT_BINARY_DIR}") - target_compile_definitions(onnxruntime_coreml_proto PUBLIC $) - set_target_properties(onnxruntime_coreml_proto PROPERTIES COMPILE_FLAGS "-fvisibility=hidden") - set_target_properties(onnxruntime_coreml_proto PROPERTIES COMPILE_FLAGS "-fvisibility-inlines-hidden") - set(_src_sub_dir "coreml/") - onnxruntime_protobuf_generate( - APPEND_PATH - GEN_SRC_SUB_DIR ${_src_sub_dir} - IMPORT_DIRS ${COREML_PROTO_ROOT} - TARGET onnxruntime_coreml_proto - ) - - if (NOT onnxruntime_BUILD_SHARED_LIB) - install(TARGETS onnxruntime_coreml_proto - ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} - LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} - RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} - FRAMEWORK DESTINATION ${CMAKE_INSTALL_BINDIR} - ) - endif() +# Check if we can build the coremltools code for creating an mlpackage with an mlprogram. +# The coremltools source requires std::filesystem::path which is only available from iOS 13 on. +set(_enable_ML_PROGRAM ON) +if (IOS AND CMAKE_OSX_DEPLOYMENT_TARGET VERSION_LESS 13.0) + message(WARNING "CoreML ML Program is not supported on iOS < 13.0. Excluding ML Program support from build.") + set(_enable_ML_PROGRAM OFF) +elseif(LINUX) + # uuid-dev is required. We don't bother installing it on CIs as it's really for manual developer testing. + find_library(LibUUID_LIBRARY NAMES uuid) + find_path(LibUUID_INCLUDE_DIR NAMES uuid/uuid.h) + if (NOT LibUUID_INCLUDE_DIR) + message(STATUS "uuid/uuid.h was not found. It is required for ML Program support. " + "Run `sudo apt install uuid-dev` if you need to test ML Program related CoreML EP code. 
") + set(_enable_ML_PROGRAM OFF) endif() +endif() + +if (_enable_ML_PROGRAM) + add_compile_definitions(COREML_ENABLE_MLPROGRAM=1) +endif() + +# Compile CoreML proto definition to ${CMAKE_CURRENT_BINARY_DIR}/coreml_proto +set(COREML_PROTO_ROOT ${coremltools_SOURCE_DIR}/mlmodel/format) +file(GLOB coreml_proto_srcs "${COREML_PROTO_ROOT}/*.proto") + +onnxruntime_add_static_library(coreml_proto ${coreml_proto_srcs}) +target_include_directories(coreml_proto + PUBLIC $ + "${CMAKE_CURRENT_BINARY_DIR}") +target_compile_definitions(coreml_proto + PUBLIC $) +set_target_properties(coreml_proto PROPERTIES COMPILE_FLAGS "-fvisibility=hidden") +set_target_properties(coreml_proto PROPERTIES COMPILE_FLAGS "-fvisibility-inlines-hidden") - # These are shared utils, - # TODO, move this to a separated lib when used by EPs other than NNAPI and CoreML - file(GLOB_RECURSE onnxruntime_providers_shared_utils_cc_srcs CONFIGURE_DEPENDS - "${ONNXRUNTIME_ROOT}/core/providers/shared/utils/utils.h" - "${ONNXRUNTIME_ROOT}/core/providers/shared/utils/utils.cc" +set(_src_sub_dir "coreml_proto/") +onnxruntime_protobuf_generate( + APPEND_PATH + GEN_SRC_SUB_DIR ${_src_sub_dir} + IMPORT_DIRS ${COREML_PROTO_ROOT} + TARGET coreml_proto +) + +if (NOT onnxruntime_BUILD_SHARED_LIB) + install(TARGETS coreml_proto + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} + RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} + FRAMEWORK DESTINATION ${CMAKE_INSTALL_BINDIR} ) +endif() + +# Add the .proto and generated .cc/.h files to the External/coreml_proto folder in Visual Studio. +# Separate source_group for each as the .proto files are in the repo and the .cc/.h files are generated in the build +# output directory. +set_target_properties(coreml_proto PROPERTIES FOLDER "External") +source_group(TREE ${COREML_PROTO_ROOT} PREFIX coreml_proto FILES ${coreml_proto_srcs}) + +# filter to the generated .cc/.h files +get_target_property(coreml_proto_generated_srcs coreml_proto SOURCES) +list(FILTER coreml_proto_generated_srcs INCLUDE REGEX "\.pb\.(h|cc)$") +source_group(TREE ${CMAKE_CURRENT_BINARY_DIR} PREFIX coreml_proto_generated FILES ${coreml_proto_generated_srcs}) + +# These are shared utils, +# TODO, move this to a separate lib when used by EPs other than NNAPI and CoreML +file(GLOB onnxruntime_providers_shared_utils_cc_srcs CONFIGURE_DEPENDS + "${ONNXRUNTIME_ROOT}/core/providers/shared/utils/utils.h" + "${ONNXRUNTIME_ROOT}/core/providers/shared/utils/utils.cc" +) +file(GLOB onnxruntime_providers_coreml_public_headers CONFIGURE_DEPENDS + "${ONNXRUNTIME_INCLUDE_DIR}/core/providers/coreml/*.h" +) + +file(GLOB + onnxruntime_providers_coreml_cc_srcs_top CONFIGURE_DEPENDS + "${ONNXRUNTIME_ROOT}/core/providers/coreml/*.h" + "${ONNXRUNTIME_ROOT}/core/providers/coreml/*.cc" +) + +# Add builder source code +file(GLOB_RECURSE + onnxruntime_providers_coreml_cc_srcs_nested CONFIGURE_DEPENDS + "${ONNXRUNTIME_ROOT}/core/providers/coreml/builders/*.h" + "${ONNXRUNTIME_ROOT}/core/providers/coreml/builders/*.cc" +) + +if(_enable_ML_PROGRAM) + # Add helpers to create mlpackage weights. limit to just the files we need to minimize the changes to make them + # build on Windows and Linux. 
file(GLOB - onnxruntime_providers_coreml_cc_srcs_top CONFIGURE_DEPENDS - "${ONNXRUNTIME_ROOT}/core/providers/coreml/*.h" - "${ONNXRUNTIME_ROOT}/core/providers/coreml/*.cc" + onnxruntime_providers_coreml_milblob_cc_srcs CONFIGURE_DEPENDS + "${coremltools_SOURCE_DIR}/mlmodel/src/MILBlob/*.hpp" + "${coremltools_SOURCE_DIR}/mlmodel/src/MILBlob/*.cpp" + "${coremltools_SOURCE_DIR}/mlmodel/src/MILBlob/Util/*.hpp" + "${coremltools_SOURCE_DIR}/mlmodel/src/MILBlob/Blob/BlobDataType.hpp" + "${coremltools_SOURCE_DIR}/mlmodel/src/MILBlob/Blob/StorageFormat.hpp" + "${coremltools_SOURCE_DIR}/mlmodel/src/MILBlob/Blob/FileWriter.?pp" + "${coremltools_SOURCE_DIR}/mlmodel/src/MILBlob/Blob/StorageWriter.?pp" ) - # Add builder source code - file(GLOB_RECURSE - onnxruntime_providers_coreml_cc_srcs_nested CONFIGURE_DEPENDS - "${ONNXRUNTIME_ROOT}/core/providers/coreml/builders/*.h" - "${ONNXRUNTIME_ROOT}/core/providers/coreml/builders/*.cc" + # Add helpers to create mlpackage + file(GLOB + onnxruntime_providers_coreml_modelpackage_cc_srcs CONFIGURE_DEPENDS + "${coremltools_SOURCE_DIR}/modelpackage/src/ModelPackage.?pp" + "${coremltools_SOURCE_DIR}/modelpackage/src/utils/JsonMap.?pp" ) - if (NOT CMAKE_SYSTEM_NAME STREQUAL "Darwin" AND NOT CMAKE_SYSTEM_NAME STREQUAL "iOS" OR CMAKE_SYSTEM_NAME STREQUAL "visionOS") - list(REMOVE_ITEM onnxruntime_providers_coreml_cc_srcs_nested - "${ONNXRUNTIME_ROOT}/core/providers/coreml/builders/model_builder.h" - "${ONNXRUNTIME_ROOT}/core/providers/coreml/builders/model_builder.cc" - ) - endif() - # Add CoreML objective c++ source code - if (CMAKE_SYSTEM_NAME STREQUAL "Darwin" OR CMAKE_SYSTEM_NAME STREQUAL "iOS" OR CMAKE_SYSTEM_NAME STREQUAL "visionOS") - file(GLOB - onnxruntime_providers_coreml_objcc_srcs CONFIGURE_DEPENDS - "${ONNXRUNTIME_ROOT}/core/providers/coreml/model/model.h" - "${ONNXRUNTIME_ROOT}/core/providers/coreml/model/model.mm" - "${ONNXRUNTIME_ROOT}/core/providers/coreml/model/host_utils.h" - "${ONNXRUNTIME_ROOT}/core/providers/coreml/model/host_utils.mm" - ) - endif() - - set(onnxruntime_providers_coreml_cc_srcs - ${onnxruntime_providers_coreml_cc_srcs_top} - ${onnxruntime_providers_coreml_cc_srcs_nested} - ${onnxruntime_providers_shared_utils_cc_srcs} + set(coremltools_srcs + ${onnxruntime_providers_coreml_milblob_cc_srcs} + ${onnxruntime_providers_coreml_modelpackage_cc_srcs} ) - source_group(TREE ${ONNXRUNTIME_ROOT}/core FILES ${onnxruntime_providers_coreml_cc_srcs}) - onnxruntime_add_static_library(onnxruntime_providers_coreml - ${onnxruntime_providers_coreml_cc_srcs} ${onnxruntime_providers_coreml_objcc_srcs} + source_group(TREE ${coremltools_SOURCE_DIR} PREFIX coremltools FILES ${coremltools_srcs}) +endif() + +# Add CoreML objective c++ source code +if (APPLE) + file(GLOB + onnxruntime_providers_coreml_objcc_srcs CONFIGURE_DEPENDS + "${ONNXRUNTIME_ROOT}/core/providers/coreml/model/model.h" + "${ONNXRUNTIME_ROOT}/core/providers/coreml/model/model.mm" + "${ONNXRUNTIME_ROOT}/core/providers/coreml/model/host_utils.h" + "${ONNXRUNTIME_ROOT}/core/providers/coreml/model/host_utils.mm" ) - onnxruntime_add_include_to_target(onnxruntime_providers_coreml - onnxruntime_common onnxruntime_framework onnx onnx_proto ${PROTOBUF_LIB} flatbuffers::flatbuffers Boost::mp11 safeint_interface +else() + # add the Model implementation that uses the protobuf types but excludes any actual CoreML dependencies + # by using stub implementations on non-Apple platforms. 
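The CONFIGURE_DEPENDS keyword used in the glob below (and throughout these files) asks the generator to re-evaluate the glob at build time and re-run CMake when the matched file set changes, trading a little build speed for correctness when sources are added or removed. A minimal sketch, with demo as a hypothetical target:

    # without CONFIGURE_DEPENDS, a newly added .cc file is silently ignored
    # until CMake is manually re-run
    file(GLOB demo_srcs CONFIGURE_DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/src/*.cc")
    add_library(demo STATIC ${demo_srcs})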
+ file(GLOB + onnxruntime_providers_coreml_objcc_srcs CONFIGURE_DEPENDS + "${ONNXRUNTIME_ROOT}/core/providers/coreml/model/host_utils.h" + "${ONNXRUNTIME_ROOT}/core/providers/coreml/model/host_utils_stub.cc" + "${ONNXRUNTIME_ROOT}/core/providers/coreml/model/model.h" + "${ONNXRUNTIME_ROOT}/core/providers/coreml/model/model_stub.cc" ) - if (CMAKE_SYSTEM_NAME STREQUAL "Darwin" OR CMAKE_SYSTEM_NAME STREQUAL "iOS" OR CMAKE_SYSTEM_NAME STREQUAL "visionOS") - onnxruntime_add_include_to_target(onnxruntime_providers_coreml onnxruntime_coreml_proto) - target_link_libraries(onnxruntime_providers_coreml PRIVATE onnxruntime_coreml_proto "-framework Foundation" "-framework CoreML") - add_dependencies(onnxruntime_providers_coreml onnxruntime_coreml_proto) - endif() - add_dependencies(onnxruntime_providers_coreml ${onnxruntime_EXTERNAL_DEPENDENCIES}) - - set_target_properties(onnxruntime_providers_coreml PROPERTIES CXX_STANDARD_REQUIRED ON) - set_target_properties(onnxruntime_providers_coreml PROPERTIES FOLDER "ONNXRuntime") - target_include_directories(onnxruntime_providers_coreml PRIVATE ${ONNXRUNTIME_ROOT} ${coreml_INCLUDE_DIRS}) - set_target_properties(onnxruntime_providers_coreml PROPERTIES LINKER_LANGUAGE CXX) - - if (NOT onnxruntime_BUILD_SHARED_LIB) - install(TARGETS onnxruntime_providers_coreml - ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} - LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} - RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} - FRAMEWORK DESTINATION ${CMAKE_INSTALL_BINDIR}) +endif() + +set(onnxruntime_providers_coreml_cc_srcs + ${onnxruntime_providers_coreml_cc_srcs_top} + ${onnxruntime_providers_coreml_cc_srcs_nested} + ${onnxruntime_providers_shared_utils_cc_srcs} + ${onnxruntime_providers_coreml_objcc_srcs} +) + +source_group(TREE ${ONNXRUNTIME_ROOT} FILES ${onnxruntime_providers_coreml_cc_srcs}) +source_group(TREE ${ONNXRUNTIME_INCLUDE_DIR} FILES ${onnxruntime_providers_coreml_public_headers}) + +onnxruntime_add_static_library(onnxruntime_providers_coreml + ${onnxruntime_providers_coreml_public_headers} + ${onnxruntime_providers_coreml_cc_srcs} + ${coremltools_srcs} +) + +onnxruntime_add_include_to_target(onnxruntime_providers_coreml + onnxruntime_common onnxruntime_framework onnx onnx_proto ${PROTOBUF_LIB} flatbuffers::flatbuffers Boost::mp11 + safeint_interface +) + +onnxruntime_add_include_to_target(onnxruntime_providers_coreml coreml_proto) +target_link_libraries(onnxruntime_providers_coreml PRIVATE coreml_proto) +add_dependencies(onnxruntime_providers_coreml coreml_proto) + +if (APPLE) + target_compile_definitions(onnxruntime_providers_coreml PRIVATE __APPLE__) +endif() + +if (_enable_ML_PROGRAM) + # Setup coremltools fp16 and json dependencies for creating an mlpackage. + # + # These are also used by external/xnnpack.cmake. 
fp16 depends on psimd + FetchContent_Declare(psimd URL ${DEP_URL_psimd} URL_HASH SHA1=${DEP_SHA1_psimd}) + onnxruntime_fetchcontent_makeavailable(psimd) + set(PSIMD_SOURCE_DIR ${psimd_SOURCE_DIR}) + FetchContent_Declare(fp16 URL ${DEP_URL_fp16} URL_HASH SHA1=${DEP_SHA1_fp16}) + set(FP16_BUILD_TESTS OFF CACHE INTERNAL "") + set(FP16_BUILD_BENCHMARKS OFF CACHE INTERNAL "") + onnxruntime_fetchcontent_makeavailable(fp16) + + # need to tweak the include paths to match what the coreml source code expects + target_include_directories(onnxruntime_providers_coreml PRIVATE + ${fp16_SOURCE_DIR}/include + ${nlohmann_json_SOURCE_DIR}/single_include/nlohmann + ${coremltools_SOURCE_DIR} + ${coremltools_SOURCE_DIR}/mlmodel/src/ + ${coremltools_SOURCE_DIR}/modelpackage/src/ + ) + + add_dependencies(onnxruntime_providers_coreml nlohmann_json::nlohmann_json fp16) + + if (LINUX) + target_link_libraries(onnxruntime_providers_coreml PRIVATE uuid) endif() +endif() + +if (APPLE) + target_link_libraries(onnxruntime_providers_coreml PRIVATE "-framework Foundation" "-framework CoreML") +endif() + +add_dependencies(onnxruntime_providers_coreml ${onnxruntime_EXTERNAL_DEPENDENCIES}) + +set_target_properties(onnxruntime_providers_coreml PROPERTIES CXX_STANDARD_REQUIRED ON) +set_target_properties(onnxruntime_providers_coreml PROPERTIES FOLDER "ONNXRuntime") +target_include_directories(onnxruntime_providers_coreml PRIVATE ${ONNXRUNTIME_ROOT} ${coreml_INCLUDE_DIRS}) +set_target_properties(onnxruntime_providers_coreml PROPERTIES LINKER_LANGUAGE CXX) + +if (NOT onnxruntime_BUILD_SHARED_LIB) + install(TARGETS onnxruntime_providers_coreml + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} + RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} + FRAMEWORK DESTINATION ${CMAKE_INSTALL_BINDIR}) +endif() diff --git a/cmake/onnxruntime_providers_cpu.cmake b/cmake/onnxruntime_providers_cpu.cmake index 397ef5b5b50e..b211c02f712b 100644 --- a/cmake/onnxruntime_providers_cpu.cmake +++ b/cmake/onnxruntime_providers_cpu.cmake @@ -60,6 +60,15 @@ if(NOT onnxruntime_DISABLE_CONTRIB_OPS) "${ONNXRUNTIME_ROOT}/contrib_ops/cpu/aten_ops/aten_op_executor.cc" ) endif() + set(onnxruntime_cpu_neural_speed_srcs + "${ONNXRUNTIME_ROOT}/contrib_ops/cpu/quantization/neural_speed_wrapper.h" + "${ONNXRUNTIME_ROOT}/contrib_ops/cpu/quantization/neural_speed_defs.h" + "${ONNXRUNTIME_ROOT}/contrib_ops/cpu/quantization/neural_speed_gemm.cc" + "${ONNXRUNTIME_ROOT}/contrib_ops/cpu/quantization/neural_speed_gemm.h" + ) + if(NOT USE_NEURAL_SPEED) + list(REMOVE_ITEM onnxruntime_cpu_contrib_ops_srcs ${onnxruntime_cpu_neural_speed_srcs}) + endif() # add using ONNXRUNTIME_ROOT so they show up under the 'contrib_ops' folder in Visual Studio source_group(TREE ${ONNXRUNTIME_ROOT} FILES ${onnxruntime_cpu_contrib_ops_srcs}) list(APPEND onnxruntime_providers_src ${onnxruntime_cpu_contrib_ops_srcs}) @@ -144,6 +153,12 @@ if (HAS_BITWISE_INSTEAD_OF_LOGICAL) target_compile_options(onnxruntime_providers PRIVATE "-Wno-bitwise-instead-of-logical") endif() +if(NOT onnxruntime_DISABLE_CONTRIB_OPS) + if(USE_NEURAL_SPEED) + onnxruntime_add_include_to_target(onnxruntime_providers neural_speed::bestla) + endif() +endif() + if (MSVC) target_compile_options(onnxruntime_providers PRIVATE "/bigobj") # if(NOT CMAKE_SIZEOF_VOID_P EQUAL 8) diff --git a/cmake/onnxruntime_providers_cuda.cmake b/cmake/onnxruntime_providers_cuda.cmake index 84d1376f99d5..1346a9ce968c 100644 --- a/cmake/onnxruntime_providers_cuda.cmake +++ b/cmake/onnxruntime_providers_cuda.cmake @@ 
-1,10 +1,25 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. - file(GLOB_RECURSE onnxruntime_providers_cuda_cc_srcs CONFIGURE_DEPENDS - "${ONNXRUNTIME_ROOT}/core/providers/cuda/*.h" - "${ONNXRUNTIME_ROOT}/core/providers/cuda/*.cc" - ) + + if (onnxruntime_CUDA_MINIMAL) + file(GLOB onnxruntime_providers_cuda_cc_srcs CONFIGURE_DEPENDS + "${ONNXRUNTIME_ROOT}/core/providers/cuda/*.h" + "${ONNXRUNTIME_ROOT}/core/providers/cuda/*.cc" + "${ONNXRUNTIME_ROOT}/core/providers/cuda/tunable/*.h" + "${ONNXRUNTIME_ROOT}/core/providers/cuda/tunable/*.cc" + ) + # Remove pch files + list(REMOVE_ITEM onnxruntime_providers_cuda_cc_srcs + "${ONNXRUNTIME_ROOT}/core/providers/cuda/integer_gemm.cc" + "${ONNXRUNTIME_ROOT}/core/providers/cuda/triton_kernel.h" + ) + else() + file(GLOB_RECURSE onnxruntime_providers_cuda_cc_srcs CONFIGURE_DEPENDS + "${ONNXRUNTIME_ROOT}/core/providers/cuda/*.h" + "${ONNXRUNTIME_ROOT}/core/providers/cuda/*.cc" + ) + endif() # Remove pch files list(REMOVE_ITEM onnxruntime_providers_cuda_cc_srcs "${ONNXRUNTIME_ROOT}/core/providers/cuda/cuda_pch.h" @@ -16,11 +31,16 @@ "${ONNXRUNTIME_ROOT}/core/providers/shared_library/*.h" "${ONNXRUNTIME_ROOT}/core/providers/shared_library/*.cc" ) - file(GLOB_RECURSE onnxruntime_providers_cuda_cu_srcs CONFIGURE_DEPENDS - "${ONNXRUNTIME_ROOT}/core/providers/cuda/*.cu" - "${ONNXRUNTIME_ROOT}/core/providers/cuda/*.cuh" - ) + + if (onnxruntime_CUDA_MINIMAL) + set(onnxruntime_providers_cuda_shared_srcs "") + else() + file(GLOB_RECURSE onnxruntime_providers_cuda_cu_srcs CONFIGURE_DEPENDS + "${ONNXRUNTIME_ROOT}/core/providers/cuda/*.cu" + "${ONNXRUNTIME_ROOT}/core/providers/cuda/*.cuh" + ) + endif() source_group(TREE ${ONNXRUNTIME_ROOT}/core FILES ${onnxruntime_providers_cuda_cc_srcs} ${onnxruntime_providers_cuda_shared_srcs} ${onnxruntime_providers_cuda_cu_srcs}) set(onnxruntime_providers_cuda_src ${onnxruntime_providers_cuda_cc_srcs} ${onnxruntime_providers_cuda_shared_srcs} ${onnxruntime_providers_cuda_cu_srcs}) @@ -102,7 +122,7 @@ endif() if(onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS) # cuda_provider_interface.cc is removed from the object target: onnxruntime_providers_cuda_obj and - # add to the lib onnxruntime_providers_cuda separatedly. + # added to the lib onnxruntime_providers_cuda separately. # onnxruntime_providers_cuda_ut can share all the object files with onnxruntime_providers_cuda except cuda_provider_interface.cc. set(cuda_provider_interface_src ${ONNXRUNTIME_ROOT}/core/providers/cuda/cuda_provider_interface.cc) list(REMOVE_ITEM onnxruntime_providers_cuda_src ${cuda_provider_interface_src}) @@ -121,18 +141,22 @@ if (HAS_GUARD_CF) target_compile_options(${target} PRIVATE "$<$:SHELL:-Xcompiler /guard:cf>") endif() + if (HAS_QSPECTRE) target_compile_options(${target} PRIVATE "$<$:SHELL:-Xcompiler /Qspectre>") endif() + foreach(ORT_FLAG ${ORT_WARNING_FLAGS}) target_compile_options(${target} PRIVATE "$<$:SHELL:-Xcompiler \"${ORT_FLAG}\">") endforeach() + # CUDA 11.3+ supports parallel compilation # https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html#options-for-guiding-compiler-driver-threads if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 11.3) option(onnxruntime_NVCC_THREADS "Number of threads that NVCC can use for compilation." 
1) target_compile_options(${target} PRIVATE "$<$:SHELL:--threads \"${onnxruntime_NVCC_THREADS}\">") endif() + if (UNIX) target_compile_options(${target} PRIVATE "$<$:SHELL:-Xcompiler -Wno-reorder>" "$<$>:-Wno-reorder>") @@ -142,6 +166,13 @@ #mutex.cuh(91): warning C4834: discarding return value of function with 'nodiscard' attribute target_compile_options(${target} PRIVATE "$<$:SHELL:-Xcompiler /wd4834>") target_compile_options(${target} PRIVATE "$<$:SHELL:-Xcompiler /wd4127>") + if (MSVC) + # the VS warnings for 'Conditional Expression is Constant' are spurious as they don't handle multiple conditions + # e.g. `if (std::is_same_v && not_a_const)` will generate the warning even though constexpr cannot + # be used due to `&& not_a_const`. This affects too many places for it to be reasonable to disable at a finer + # granularity. + target_compile_options(${target} PRIVATE "$<$:/wd4127>") + endif() endif() onnxruntime_add_include_to_target(${target} onnxruntime_common onnxruntime_framework onnx onnx_proto ${PROTOBUF_LIB} flatbuffers::flatbuffers) @@ -156,10 +187,16 @@ endif() add_dependencies(${target} onnxruntime_providers_shared ${onnxruntime_EXTERNAL_DEPENDENCIES}) - target_link_libraries(${target} PRIVATE cublasLt cublas cudnn curand cufft ${ABSEIL_LIBS} ${ONNXRUNTIME_PROVIDERS_SHARED} Boost::mp11 safeint_interface) - if(onnxruntime_CUDNN_HOME) - target_include_directories(${target} PRIVATE ${onnxruntime_CUDNN_HOME}/include) - target_link_directories(${target} PRIVATE ${onnxruntime_CUDNN_HOME}/lib) + if(onnxruntime_CUDA_MINIMAL) + target_compile_definitions(${target} PRIVATE USE_CUDA_MINIMAL) + target_link_libraries(${target} PRIVATE ${ABSEIL_LIBS} ${ONNXRUNTIME_PROVIDERS_SHARED} Boost::mp11 safeint_interface CUDA::cudart) + else() + target_link_libraries(${target} PRIVATE CUDA::cublasLt CUDA::cublas cudnn CUDA::curand CUDA::cufft CUDA::cudart + ${ABSEIL_LIBS} ${ONNXRUNTIME_PROVIDERS_SHARED} Boost::mp11 safeint_interface) + if(onnxruntime_CUDNN_HOME) + target_include_directories(${target} PRIVATE ${onnxruntime_CUDNN_HOME}/include) + target_link_directories(${target} PRIVATE ${onnxruntime_CUDNN_HOME}/lib) + endif() endif() if (onnxruntime_USE_TRITON_KERNEL) @@ -171,25 +208,24 @@ target_include_directories(${target} PRIVATE ${triton_kernel_header_dir}) target_link_libraries(${target} PUBLIC -Wl,--whole-archive ${triton_kernel_obj_file} -Wl,--no-whole-archive) # lib cuda needed by cuLaunchKernel - target_link_libraries(${target} PRIVATE cuda) + target_link_libraries(${target} PRIVATE CUDA::cuda_driver) endif() include(cutlass) - target_include_directories(${target} PRIVATE ${cutlass_SOURCE_DIR}/include ${cutlass_SOURCE_DIR}/examples) + target_include_directories(${target} PRIVATE ${cutlass_SOURCE_DIR}/include ${cutlass_SOURCE_DIR}/examples ${cutlass_SOURCE_DIR}/tools/util/include) - target_include_directories(${target} PRIVATE ${ONNXRUNTIME_ROOT} ${CMAKE_CURRENT_BINARY_DIR} ${eigen_INCLUDE_DIRS} ${TVM_INCLUDES} PUBLIC ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) + target_include_directories(${target} PRIVATE ${ONNXRUNTIME_ROOT} ${CMAKE_CURRENT_BINARY_DIR} ${eigen_INCLUDE_DIRS} ${TVM_INCLUDES} + PUBLIC ${CUDAToolkit_INCLUDE_DIRS}) # ${CMAKE_CURRENT_BINARY_DIR} is so that #include "onnxruntime_config.h" inside tensor_shape.h is found set_target_properties(${target} PROPERTIES LINKER_LANGUAGE CUDA) set_target_properties(${target} PROPERTIES FOLDER "ONNXRuntime") if (onnxruntime_ENABLE_CUDA_PROFILING) # configure cupti for cuda profiling - target_include_directories(${target} PRIVATE 
${onnxruntime_CUDA_HOME}/extras/CUPTI/include) - target_link_directories(${target} PRIVATE ${onnxruntime_CUDA_HOME}/extras/CUPTI/lib64) - target_link_libraries(${target} PRIVATE cupti) + target_link_libraries(${target} PRIVATE CUDA::cupti) endif() - if (onnxruntime_ENABLE_NVTX_PROFILE AND NOT WIN32) - target_link_libraries(${target} PRIVATE nvToolsExt) + if (onnxruntime_ENABLE_NVTX_PROFILE) + target_link_libraries(${target} PRIVATE CUDA::nvtx3) endif() if (onnxruntime_ENABLE_TRAINING_OPS) diff --git a/cmake/onnxruntime_providers_nnapi.cmake b/cmake/onnxruntime_providers_nnapi.cmake index 5ac25a3b76ef..b718a976eb26 100644 --- a/cmake/onnxruntime_providers_nnapi.cmake +++ b/cmake/onnxruntime_providers_nnapi.cmake @@ -49,12 +49,10 @@ endif() # These are shared utils, - # TODO, move this to a separated lib when used by EPs other than NNAPI and CoreML + # TODO, move this to a separate lib when used by EPs other than NNAPI and CoreML list(APPEND onnxruntime_provider_nnapi_cc_src_patterns "${ONNXRUNTIME_ROOT}/core/providers/shared/utils/utils.h" "${ONNXRUNTIME_ROOT}/core/providers/shared/utils/utils.cc" - "${ONNXRUNTIME_ROOT}/core/providers/shared/node_unit/node_unit.h" - "${ONNXRUNTIME_ROOT}/core/providers/shared/node_unit/node_unit.cc" ) file(GLOB onnxruntime_providers_nnapi_cc_srcs CONFIGURE_DEPENDS ${onnxruntime_provider_nnapi_cc_src_patterns}) @@ -81,4 +79,4 @@ LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} FRAMEWORK DESTINATION ${CMAKE_INSTALL_BINDIR}) - endif() \ No newline at end of file + endif() diff --git a/cmake/onnxruntime_providers_openvino.cmake b/cmake/onnxruntime_providers_openvino.cmake index e26f0bfc0b75..5876b2b5c448 100644 --- a/cmake/onnxruntime_providers_openvino.cmake +++ b/cmake/onnxruntime_providers_openvino.cmake @@ -16,23 +16,19 @@ endif() # Header paths - find_package(InferenceEngine REQUIRED) - find_package(ngraph REQUIRED) - - if (OPENVINO_2022_1 OR OPENVINO_2022_2) find_package(OpenVINO REQUIRED COMPONENTS Runtime ONNX) - list (OV_20_LIBS openvino::frontend::onnx openvino::runtime) + if(OpenVINO_VERSION VERSION_LESS 2023.0) + message(FATAL_ERROR "OpenVINO 2023.0 and newer are supported. 
Please use the latest OpenVINO release") endif() if (WIN32) unset(CMAKE_MAP_IMPORTED_CONFIG_RELWITHDEBINFO) endif() + list(APPEND OPENVINO_LIB_LIST openvino::frontend::onnx openvino::runtime ${PYTHON_LIBRARIES}) if ((DEFINED ENV{OPENCL_LIBS}) AND (DEFINED ENV{OPENCL_INCS})) add_definitions(-DIO_BUFFER_ENABLED=1) - list(APPEND OPENVINO_LIB_LIST $ENV{OPENCL_LIBS} ${OV_20_LIBS} ${InferenceEngine_LIBRARIES} ${NGRAPH_LIBRARIES} ngraph::onnx_importer ${PYTHON_LIBRARIES}) - else() - list(APPEND OPENVINO_LIB_LIST ${OV_20_LIBS} ${InferenceEngine_LIBRARIES} ${NGRAPH_LIBRARIES} ngraph::onnx_importer ${PYTHON_LIBRARIES}) + list(APPEND OPENVINO_LIB_LIST $ENV{OPENCL_LIBS}) endif() source_group(TREE ${ONNXRUNTIME_ROOT}/core FILES ${onnxruntime_providers_openvino_cc_srcs}) @@ -75,7 +71,14 @@ message(FATAL_ERROR "onnxruntime_providers_openvino unknown platform, need to specify shared library exports for it") endif() - install(TARGETS onnxruntime_providers_openvino - ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} - LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} - RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}) \ No newline at end of file + if (CMAKE_OPENVINO_LIBRARY_INSTALL_DIR) + install(TARGETS onnxruntime_providers_openvino + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} + LIBRARY DESTINATION ${CMAKE_OPENVINO_LIBRARY_INSTALL_DIR} + RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}) + else() + install(TARGETS onnxruntime_providers_openvino + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} + RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}) + endif() diff --git a/cmake/onnxruntime_providers_qnn.cmake b/cmake/onnxruntime_providers_qnn.cmake index a93a06e960c8..b68d84c23bb3 100644 --- a/cmake/onnxruntime_providers_qnn.cmake +++ b/cmake/onnxruntime_providers_qnn.cmake @@ -4,12 +4,10 @@ add_compile_definitions(USE_QNN=1) # These are shared utils, - # TODO, move this to a separated lib when used by EPs other than QNN, NNAPI and CoreML - file(GLOB_RECURSE onnxruntime_providers_shared_utils_cc_srcs CONFIGURE_DEPENDS + # TODO, move to a separate lib when used by EPs other than QNN, NNAPI and CoreML + file(GLOB onnxruntime_providers_shared_utils_cc_srcs CONFIGURE_DEPENDS "${ONNXRUNTIME_ROOT}/core/providers/shared/utils/utils.h" "${ONNXRUNTIME_ROOT}/core/providers/shared/utils/utils.cc" - "${ONNXRUNTIME_ROOT}/core/providers/shared/node_unit/node_unit.h" - "${ONNXRUNTIME_ROOT}/core/providers/shared/node_unit/node_unit.cc" ) file(GLOB_RECURSE @@ -42,4 +40,4 @@ # ignore the warning unknown-pragmas on "pragma region" if(NOT MSVC) target_compile_options(onnxruntime_providers_qnn PRIVATE "-Wno-unknown-pragmas") - endif() \ No newline at end of file + endif() diff --git a/cmake/onnxruntime_providers_tensorrt.cmake b/cmake/onnxruntime_providers_tensorrt.cmake index 686a993de3a4..15ffc29e79ff 100644 --- a/cmake/onnxruntime_providers_tensorrt.cmake +++ b/cmake/onnxruntime_providers_tensorrt.cmake @@ -8,7 +8,7 @@ set(BUILD_LIBRARY_ONLY 1) add_definitions("-DONNX_ML=1") add_definitions("-DONNX_NAMESPACE=onnx") - set(CUDA_INCLUDE_DIRS ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) + set(CUDA_INCLUDE_DIRS ${CUDAToolkit_INCLUDE_DIRS}) set(TENSORRT_ROOT ${onnxruntime_TENSORRT_HOME}) set(OLD_CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS}) set(PROTOBUF_LIBRARY ${PROTOBUF_LIB}) @@ -58,7 +58,7 @@ URL_HASH SHA1=${DEP_SHA1_onnx_tensorrt} ) if (NOT CUDA_INCLUDE_DIR) - set(CUDA_INCLUDE_DIR ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) # onnx-tensorrt repo needs this variable to build + set(CUDA_INCLUDE_DIR ${CUDAToolkit_INCLUDE_DIRS}) # 
onnx-tensorrt repo needs this variable to build endif() # The onnx_tensorrt repo contains a test program, getSupportedAPITest, which doesn't support Windows. It uses # unistd.h. So we must exclude it from our build. onnxruntime_fetchcontent_makeavailable is for the purpose. @@ -102,11 +102,12 @@ onnxruntime_add_include_to_target(onnxruntime_providers_tensorrt onnxruntime_common onnx flatbuffers::flatbuffers Boost::mp11 safeint_interface) add_dependencies(onnxruntime_providers_tensorrt onnxruntime_providers_shared ${onnxruntime_EXTERNAL_DEPENDENCIES}) if (onnxruntime_USE_TENSORRT_BUILTIN_PARSER) - target_link_libraries(onnxruntime_providers_tensorrt PRIVATE ${trt_link_libs} cudart ${ONNXRUNTIME_PROVIDERS_SHARED} ${PROTOBUF_LIB} flatbuffers::flatbuffers Boost::mp11 safeint_interface ${ABSEIL_LIBS}) + target_link_libraries(onnxruntime_providers_tensorrt PRIVATE ${trt_link_libs} ${ONNXRUNTIME_PROVIDERS_SHARED} ${PROTOBUF_LIB} flatbuffers::flatbuffers Boost::mp11 safeint_interface ${ABSEIL_LIBS} PUBLIC CUDA::cudart) else() - target_link_libraries(onnxruntime_providers_tensorrt PRIVATE ${onnxparser_link_libs} ${trt_link_libs} cudart ${ONNXRUNTIME_PROVIDERS_SHARED} ${PROTOBUF_LIB} flatbuffers::flatbuffers ${ABSEIL_LIBS}) + target_link_libraries(onnxruntime_providers_tensorrt PRIVATE ${onnxparser_link_libs} ${trt_link_libs} ${ONNXRUNTIME_PROVIDERS_SHARED} ${PROTOBUF_LIB} flatbuffers::flatbuffers ${ABSEIL_LIBS} PUBLIC CUDA::cudart) endif() - target_include_directories(onnxruntime_providers_tensorrt PRIVATE ${ONNXRUNTIME_ROOT} ${CMAKE_CURRENT_BINARY_DIR} ${eigen_INCLUDE_DIRS} PUBLIC ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) + target_include_directories(onnxruntime_providers_tensorrt PRIVATE ${ONNXRUNTIME_ROOT} ${CMAKE_CURRENT_BINARY_DIR} ${eigen_INCLUDE_DIRS} + PUBLIC ${CUDAToolkit_INCLUDE_DIRS}) if(onnxruntime_CUDNN_HOME) target_include_directories(onnxruntime_providers_tensorrt PRIVATE ${onnxruntime_CUDNN_HOME}/include) endif() diff --git a/cmake/onnxruntime_providers_vitisai.cmake b/cmake/onnxruntime_providers_vitisai.cmake index 0951c2d02664..183a3e196af4 100644 --- a/cmake/onnxruntime_providers_vitisai.cmake +++ b/cmake/onnxruntime_providers_vitisai.cmake @@ -14,14 +14,19 @@ "${ONNXRUNTIME_ROOT}/core/providers/vitisai/*.h" "${ONNXRUNTIME_ROOT}/core/providers/vitisai/imp/*.cc" "${ONNXRUNTIME_ROOT}/core/providers/vitisai/imp/*.h" + "${ONNXRUNTIME_ROOT}/core/providers/shared_library/*.h" + "${ONNXRUNTIME_ROOT}/core/providers/shared_library/*.cc" ) source_group(TREE ${ONNXRUNTIME_ROOT}/core FILES ${onnxruntime_providers_vitisai_cc_srcs}) - onnxruntime_add_static_library(onnxruntime_providers_vitisai ${onnxruntime_providers_vitisai_cc_srcs}) - onnxruntime_add_include_to_target(onnxruntime_providers_vitisai onnxruntime_common onnxruntime_framework onnx onnx_proto) - target_link_libraries(onnxruntime_providers_vitisai PRIVATE onnx protobuf::libprotobuf nlohmann_json::nlohmann_json) - if(NOT MSVC) - target_compile_options(onnxruntime_providers_vitisai PUBLIC $<$:-U_FORTIFY_SOURCE -D_FORTIFY_SOURCE=0>) - endif(NOT MSVC) + onnxruntime_add_shared_library(onnxruntime_providers_vitisai ${onnxruntime_providers_vitisai_cc_srcs}) + onnxruntime_add_include_to_target(onnxruntime_providers_vitisai ${ONNXRUNTIME_PROVIDERS_SHARED} nlohmann_json::nlohmann_json safeint_interface flatbuffers::flatbuffers) + target_link_libraries(onnxruntime_providers_vitisai PRIVATE ${ONNXRUNTIME_PROVIDERS_SHARED}) + if(MSVC) + onnxruntime_add_include_to_target(onnxruntime_providers_vitisai dbghelp) + set_property(TARGET 
onnxruntime_providers_vitisai APPEND_STRING PROPERTY LINK_FLAGS "-DEF:${ONNXRUNTIME_ROOT}/core/providers/vitisai/symbols.def") + else(MSVC) + set_property(TARGET onnxruntime_providers_vitisai APPEND_STRING PROPERTY LINK_FLAGS "-Xlinker --version-script=${ONNXRUNTIME_ROOT}/core/providers/vitisai/version_script.lds -Xlinker --gc-sections") + endif(MSVC) target_include_directories(onnxruntime_providers_vitisai PRIVATE "${ONNXRUNTIME_ROOT}/core/providers/vitisai/include" ${XRT_INCLUDE_DIRS} ${CMAKE_CURRENT_BINARY_DIR}/VitisAI) if(MSVC) @@ -30,17 +35,18 @@ target_compile_options(onnxruntime_providers_vitisai PRIVATE "/wd4251") # for unused formal parameter target_compile_options(onnxruntime_providers_vitisai PRIVATE "/wd4100") + # for type name first seen using 'class' now seen using 'struct' + target_compile_options(onnxruntime_providers_vitisai PRIVATE "/wd4099") else(MSVC) + target_compile_options(onnxruntime_providers_vitisai PUBLIC $<$:-U_FORTIFY_SOURCE -D_FORTIFY_SOURCE=0>) target_compile_options(onnxruntime_providers_vitisai PRIVATE -Wno-unused-parameter) endif(MSVC) set_target_properties(onnxruntime_providers_vitisai PROPERTIES FOLDER "ONNXRuntime") set_target_properties(onnxruntime_providers_vitisai PROPERTIES LINKER_LANGUAGE CXX) - if (NOT onnxruntime_BUILD_SHARED_LIB) - install(TARGETS onnxruntime_providers_vitisai - ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} - LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} - RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} - FRAMEWORK DESTINATION ${CMAKE_INSTALL_BINDIR}) - endif() + install(TARGETS onnxruntime_providers_vitisai + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} + RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} + FRAMEWORK DESTINATION ${CMAKE_INSTALL_BINDIR}) diff --git a/cmake/onnxruntime_providers_xnnpack.cmake b/cmake/onnxruntime_providers_xnnpack.cmake index 9c00703ca084..796536ac9d12 100644 --- a/cmake/onnxruntime_providers_xnnpack.cmake +++ b/cmake/onnxruntime_providers_xnnpack.cmake @@ -7,9 +7,6 @@ "${ONNXRUNTIME_INCLUDE_DIR}/core/providers/xnnpack/*.h" "${ONNXRUNTIME_ROOT}/core/providers/xnnpack/*.h" "${ONNXRUNTIME_ROOT}/core/providers/xnnpack/*.cc" - # utils for handling QDQ models - "${ONNXRUNTIME_ROOT}/core/providers/shared/node_unit/node_unit.h" - "${ONNXRUNTIME_ROOT}/core/providers/shared/node_unit/node_unit.cc" ) source_group(TREE ${REPO_ROOT} FILES ${onnxruntime_providers_xnnpack_cc_srcs}) @@ -19,6 +16,12 @@ flatbuffers::flatbuffers Boost::mp11 safeint_interface ) + # TODO fix stringop-overflow warnings + # Add compile option to suppress stringop-overflow error in Flatbuffers. 
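The HAS_STRINGOP_OVERFLOW variable tested just below is defined elsewhere in the build; variables like this are typically produced by a compiler-flag probe along the following lines (a sketch under that assumption, not necessarily this repo's exact code):

    include(CheckCXXCompilerFlag)
    # sets HAS_STRINGOP_OVERFLOW to a true value only when the compiler knows the
    # warning, so -Wno-error=stringop-overflow is only passed where it is understood
    check_cxx_compiler_flag(-Wstringop-overflow HAS_STRINGOP_OVERFLOW)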
+ if (HAS_STRINGOP_OVERFLOW) + target_compile_options(onnxruntime_providers_xnnpack PRIVATE -Wno-error=stringop-overflow) + endif() + add_dependencies(onnxruntime_providers_xnnpack onnx ${onnxruntime_EXTERNAL_DEPENDENCIES}) set_target_properties(onnxruntime_providers_xnnpack PROPERTIES FOLDER "ONNXRuntime") diff --git a/cmake/onnxruntime_python.cmake b/cmake/onnxruntime_python.cmake index 86c1071dba98..17e0f1c5f3fb 100644 --- a/cmake/onnxruntime_python.cmake +++ b/cmake/onnxruntime_python.cmake @@ -170,7 +170,6 @@ target_link_libraries(onnxruntime_pybind11_state PRIVATE onnxruntime_session ${onnxruntime_libs} ${PROVIDERS_TVM} - ${PROVIDERS_VITISAI} ${PROVIDERS_NNAPI} ${PROVIDERS_XNNPACK} ${PROVIDERS_COREML} @@ -283,10 +282,7 @@ if (WIN32) get_filename_component(CUDNN_DLL_NAME ${CUDNN_DLL_PATH} NAME_WE) string(REPLACE "cudnn64_" "" CUDNN_VERSION "${CUDNN_DLL_NAME}") if(NOT onnxruntime_CUDA_VERSION) - message("Reading json file ${onnxruntime_CUDA_HOME}/version.json") - set(CUDA_SDK_JSON_FILE_PATH "${onnxruntime_CUDA_HOME}/version.json") - file(READ ${CUDA_SDK_JSON_FILE_PATH} CUDA_SDK_JSON_CONTENT) - string(JSON onnxruntime_CUDA_VERSION GET ${CUDA_SDK_JSON_CONTENT} "cuda" "version") + set(onnxruntime_CUDA_VERSION ${CUDAToolkit_VERSION}) message("onnxruntime_CUDA_VERSION=${onnxruntime_CUDA_VERSION}") endif() file(APPEND "${VERSION_INFO_FILE}" @@ -354,9 +350,6 @@ if (onnxruntime_ENABLE_TRAINING) file(GLOB onnxruntime_python_optim_srcs CONFIGURE_DEPENDS "${ORTTRAINING_SOURCE_DIR}/python/training/optim/*.py" ) - file(GLOB onnxruntime_python_torchdynamo_srcs CONFIGURE_DEPENDS - "${ORTTRAINING_SOURCE_DIR}/python/training/torchdynamo/*.py" - ) file(GLOB onnxruntime_python_ortmodule_srcs CONFIGURE_DEPENDS "${ORTTRAINING_SOURCE_DIR}/python/training/ortmodule/*.py" ) @@ -477,6 +470,9 @@ file(GLOB onnxruntime_python_transformers_models_llama_src CONFIGURE_DEPENDS file(GLOB onnxruntime_python_transformers_models_longformer_src CONFIGURE_DEPENDS "${ONNXRUNTIME_ROOT}/python/tools/transformers/models/longformer/*.py" ) +file(GLOB onnxruntime_python_transformers_models_phi2_src CONFIGURE_DEPENDS + "${ONNXRUNTIME_ROOT}/python/tools/transformers/models/phi2/*.py" +) file(GLOB onnxruntime_python_transformers_models_stable_diffusion_src CONFIGURE_DEPENDS "${ONNXRUNTIME_ROOT}/python/tools/transformers/models/stable_diffusion/*.py" ) @@ -547,6 +543,7 @@ add_custom_command( COMMAND ${CMAKE_COMMAND} -E make_directory $/onnxruntime/transformers/models/gpt2 COMMAND ${CMAKE_COMMAND} -E make_directory $/onnxruntime/transformers/models/llama COMMAND ${CMAKE_COMMAND} -E make_directory $/onnxruntime/transformers/models/longformer + COMMAND ${CMAKE_COMMAND} -E make_directory $/onnxruntime/transformers/models/phi2 COMMAND ${CMAKE_COMMAND} -E make_directory $/onnxruntime/transformers/models/stable_diffusion COMMAND ${CMAKE_COMMAND} -E make_directory $/onnxruntime/transformers/models/t5 COMMAND ${CMAKE_COMMAND} -E make_directory $/onnxruntime/transformers/models/whisper @@ -650,6 +647,9 @@ add_custom_command( COMMAND ${CMAKE_COMMAND} -E copy ${onnxruntime_python_transformers_models_longformer_src} $/onnxruntime/transformers/models/longformer/ + COMMAND ${CMAKE_COMMAND} -E copy + ${onnxruntime_python_transformers_models_phi2_src} + $/onnxruntime/transformers/models/phi2/ COMMAND ${CMAKE_COMMAND} -E copy ${onnxruntime_python_transformers_models_stable_diffusion_src} $/onnxruntime/transformers/models/stable_diffusion/ @@ -746,7 +746,6 @@ if (onnxruntime_ENABLE_TRAINING) COMMAND ${CMAKE_COMMAND} -E make_directory 
$/onnxruntime/training/experimental COMMAND ${CMAKE_COMMAND} -E make_directory $/onnxruntime/training/experimental/gradient_graph COMMAND ${CMAKE_COMMAND} -E make_directory $/onnxruntime/training/optim - COMMAND ${CMAKE_COMMAND} -E make_directory $/onnxruntime/training/torchdynamo COMMAND ${CMAKE_COMMAND} -E make_directory $/onnxruntime/training/ortmodule COMMAND ${CMAKE_COMMAND} -E make_directory $/onnxruntime/training/ortmodule/experimental COMMAND ${CMAKE_COMMAND} -E make_directory $/onnxruntime/training/ortmodule/experimental/json_config @@ -777,9 +776,6 @@ if (onnxruntime_ENABLE_TRAINING) COMMAND ${CMAKE_COMMAND} -E copy ${onnxruntime_python_optim_srcs} $/onnxruntime/training/optim/ - COMMAND ${CMAKE_COMMAND} -E copy - ${onnxruntime_python_torchdynamo_srcs} - $/onnxruntime/training/torchdynamo/ COMMAND ${CMAKE_COMMAND} -E copy ${onnxruntime_python_ortmodule_srcs} $/onnxruntime/training/ortmodule/ @@ -859,6 +855,16 @@ if (onnxruntime_USE_DNNL) ) endif() +if (onnxruntime_USE_VITISAI) + add_custom_command( + TARGET onnxruntime_pybind11_state POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy + ${DNNL_DLL_PATH} $ + $ + $/onnxruntime/capi/ + ) +endif() + if (onnxruntime_USE_TENSORRT) add_custom_command( TARGET onnxruntime_pybind11_state POST_BUILD @@ -995,6 +1001,15 @@ if (onnxruntime_USE_COREML) ) endif() +if (onnxruntime_USE_QNN) + add_custom_command( + TARGET onnxruntime_pybind11_state POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy + ${QNN_LIB_FILES} + $/onnxruntime/capi/ + ) +endif() + endif() if (onnxruntime_ENABLE_LANGUAGE_INTEROP_OPS) include(onnxruntime_language_interop_ops.cmake) diff --git a/cmake/onnxruntime_rocm_hipify.cmake b/cmake/onnxruntime_rocm_hipify.cmake index f70961a66329..0051f241e4f9 100644 --- a/cmake/onnxruntime_rocm_hipify.cmake +++ b/cmake/onnxruntime_rocm_hipify.cmake @@ -20,10 +20,6 @@ set(contrib_ops_excluded_files "bert/fastertransformer_decoder_attention/*" "bert/multihead_attention.cc" "bert/multihead_attention.h" - "bert/fast_gelu_impl.cu" - "bert/fast_gelu_impl.h" - "bert/fast_gelu.cc" - "bert/fast_gelu.h" "bert/relative_attn_bias.cc" "bert/relative_attn_bias.h" "bert/relative_attn_bias_impl.cu" @@ -44,9 +40,7 @@ set(contrib_ops_excluded_files "bert/packed_multihead_attention.cc" "bert/packed_multihead_attention_impl.h" "bert/packed_multihead_attention_impl.cu" - "diffusion/group_norm.cc" "diffusion/group_norm_impl.cu" - "diffusion/group_norm_impl.h" "diffusion/nhwc_conv.cc" "math/gemm_float8.cc" "math/gemm_float8.cu" @@ -66,6 +60,8 @@ set(contrib_ops_excluded_files "quantization/matmul_nbits.cc" "quantization/matmul_nbits.cuh" "quantization/matmul_nbits.cu" + "quantization/moe_quantization.h" + "quantization/moe_quantization.cc" "quantization/quantize_dequantize_linear.cc" "quantization/qordered_ops/qordered_attention_impl.cu" "quantization/qordered_ops/qordered_attention_impl.h" @@ -100,26 +96,18 @@ set(contrib_ops_excluded_files "bert/group_query_attention.cc" "bert/group_query_attention_impl.h" "bert/group_query_attention_impl.cu" + "collective/distributed_*" + "collective/shard*" ) -if (NOT onnxruntime_ENABLE_ATEN) - list(APPEND contrib_ops_excluded_files "aten_ops/aten_op.cc") -endif() if (NOT onnxruntime_USE_NCCL) # Those are string patterns to exclude. Do NOT use stars such as # collective/*.cc or *.h. 
list(APPEND contrib_ops_excluded_files "collective/nccl_kernels.cc") - list(APPEND contrib_ops_excluded_files "collective/sharded_moe.h") - list(APPEND contrib_ops_excluded_files "collective/sharded_moe.cc") - list(APPEND contrib_ops_excluded_files "collective/sharding.cc") - list(APPEND contrib_ops_excluded_files "collective/sharding_spec.cc") - list(APPEND contrib_ops_excluded_files "collective/distributed_matmul.cc") - list(APPEND contrib_ops_excluded_files "collective/distributed_slice.cc") - list(APPEND contrib_ops_excluded_files "collective/distributed_reshape.cc") - list(APPEND contrib_ops_excluded_files "collective/distributed_expand.cc") - list(APPEND contrib_ops_excluded_files "collective/distributed_reduce.cc") - list(APPEND contrib_ops_excluded_files "collective/distributed_unsqueeze.cc") - list(APPEND contrib_ops_excluded_files "collective/distributed_squeeze.cc") +endif() + +if (NOT onnxruntime_ENABLE_ATEN) + list(APPEND contrib_ops_excluded_files "aten_ops/aten_op.cc") endif() set(provider_excluded_files diff --git a/cmake/onnxruntime_unittests.cmake b/cmake/onnxruntime_unittests.cmake index 6991081f1b0d..fce60090b81f 100644 --- a/cmake/onnxruntime_unittests.cmake +++ b/cmake/onnxruntime_unittests.cmake @@ -1,6 +1,6 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. -if (${CMAKE_SYSTEM_NAME} STREQUAL "iOS") +if (IOS) find_package(XCTest REQUIRED) endif() @@ -18,7 +18,7 @@ function(AddTest) cmake_parse_arguments(_UT "DYN" "TARGET" "LIBS;SOURCES;DEPENDS;TEST_ARGS" ${ARGN}) list(REMOVE_DUPLICATES _UT_SOURCES) - if (${CMAKE_SYSTEM_NAME} STREQUAL "iOS") + if (IOS) onnxruntime_add_executable(${_UT_TARGET} ${TEST_SRC_DIR}/xctest/orttestmain.m) else() onnxruntime_add_executable(${_UT_TARGET} ${_UT_SOURCES}) @@ -67,7 +67,7 @@ function(AddTest) if(onnxruntime_USE_CUDA) #XXX: we should not need to do this. onnxruntime_test_all.exe should not have direct dependency on CUDA DLLs, # otherwise it will impact when CUDA DLLs can be unloaded. 
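The change below swaps the bare cudart library name for the CUDA::cudart imported target, the same migration applied throughout this diff (CUDA::cublas, CUDA::cupti, CUDA::nvtx3, and so on). These targets come from CMake's FindCUDAToolkit module and carry their own include and link paths, which is what lets the manual include/link directory bookkeeping against ${onnxruntime_CUDA_HOME} be dropped. A minimal sketch, with demo_app as a hypothetical target:

    # CMake 3.17+ ships FindCUDAToolkit, which defines the CUDA:: imported targets
    find_package(CUDAToolkit REQUIRED)
    add_executable(demo_app main.cc)
    # header and library paths travel with the target; no extra include/link dirs needed
    target_link_libraries(demo_app PRIVATE CUDA::cudart)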
- target_link_libraries(${_UT_TARGET} PRIVATE cudart) + target_link_libraries(${_UT_TARGET} PRIVATE CUDA::cudart) endif() target_link_libraries(${_UT_TARGET} PRIVATE ${_UT_LIBS} GTest::gtest GTest::gmock ${onnxruntime_EXTERNAL_LIBRARIES}) endif() @@ -111,7 +111,9 @@ function(AddTest) target_compile_options(${_UT_TARGET} PRIVATE ${DISABLED_WARNINGS_FOR_TVM}) target_compile_options(${_UT_TARGET} PRIVATE "$<$:SHELL:--compiler-options -Wno-error=sign-compare>" "$<$>:-Wno-error=sign-compare>") - target_compile_options(${_UT_TARGET} PRIVATE "-Wno-error=uninitialized") + if (${HAS_NOERROR}) + target_compile_options(${_UT_TARGET} PRIVATE "$<$:-Wno-error=uninitialized>") + endif() endif() set(TEST_ARGS ${_UT_TEST_ARGS}) @@ -127,7 +129,7 @@ function(AddTest) endif() endif(onnxruntime_GENERATE_TEST_REPORTS) - if (${CMAKE_SYSTEM_NAME} STREQUAL "iOS") + if (IOS) # target_sources(${_UT_TARGET} PRIVATE ${TEST_SRC_DIR}/xctest/orttestmain.m) set_target_properties(${_UT_TARGET} PROPERTIES FOLDER "ONNXRuntimeTest" MACOSX_BUNDLE_BUNDLE_NAME ${_UT_TARGET} @@ -565,11 +567,7 @@ if(onnxruntime_USE_ROCM) endif() if(onnxruntime_USE_COREML) - if (CMAKE_SYSTEM_NAME STREQUAL "Darwin" OR CMAKE_SYSTEM_NAME STREQUAL "iOS") - list(APPEND onnxruntime_test_providers_dependencies onnxruntime_providers_coreml onnxruntime_coreml_proto) - else() - list(APPEND onnxruntime_test_providers_dependencies onnxruntime_providers_coreml) - endif() + list(APPEND onnxruntime_test_providers_dependencies onnxruntime_providers_coreml coreml_proto) endif() if(onnxruntime_USE_ACL) @@ -591,7 +589,6 @@ set(ONNXRUNTIME_TEST_LIBS # CUDA, ROCM, TENSORRT, MIGRAPHX, DNNL, and OpenVINO are dynamically loaded at runtime ${PROVIDERS_NNAPI} ${PROVIDERS_JS} - ${PROVIDERS_VITISAI} ${PROVIDERS_QNN} ${PROVIDERS_SNPE} ${PROVIDERS_RKNPU} @@ -675,15 +672,9 @@ endif() if(onnxruntime_USE_COREML) list(APPEND onnxruntime_test_framework_src_patterns ${TEST_SRC_DIR}/providers/coreml/*) - if (CMAKE_SYSTEM_NAME STREQUAL "Darwin" OR CMAKE_SYSTEM_NAME STREQUAL "iOS") - list(APPEND onnxruntime_test_framework_libs onnxruntime_providers_coreml onnxruntime_coreml_proto) - list(APPEND onnxruntime_test_providers_dependencies onnxruntime_providers_coreml onnxruntime_coreml_proto) - list(APPEND onnxruntime_test_providers_libs onnxruntime_providers_coreml onnxruntime_coreml_proto) - else() - list(APPEND onnxruntime_test_framework_libs onnxruntime_providers_coreml) - list(APPEND onnxruntime_test_providers_dependencies onnxruntime_providers_coreml) - list(APPEND onnxruntime_test_providers_libs onnxruntime_providers_coreml) - endif() + list(APPEND onnxruntime_test_framework_libs onnxruntime_providers_coreml coreml_proto) + list(APPEND onnxruntime_test_providers_dependencies onnxruntime_providers_coreml coreml_proto) + list(APPEND onnxruntime_test_providers_libs onnxruntime_providers_coreml coreml_proto) endif() if(onnxruntime_USE_XNNPACK) @@ -743,34 +734,37 @@ target_include_directories(onnxruntime_test_utils PUBLIC "${TEST_SRC_DIR}/util/i set_target_properties(onnxruntime_test_utils PROPERTIES FOLDER "ONNXRuntimeTest") source_group(TREE ${TEST_SRC_DIR} FILES ${onnxruntime_test_utils_src}) -set(onnx_test_runner_src_dir ${TEST_SRC_DIR}/onnx) -file(GLOB onnx_test_runner_common_srcs CONFIGURE_DEPENDS - ${onnx_test_runner_src_dir}/*.h - ${onnx_test_runner_src_dir}/*.cc) +if(NOT IOS) + set(onnx_test_runner_src_dir ${TEST_SRC_DIR}/onnx) + file(GLOB onnx_test_runner_common_srcs CONFIGURE_DEPENDS + ${onnx_test_runner_src_dir}/*.h + ${onnx_test_runner_src_dir}/*.cc) -list(REMOVE_ITEM 
onnx_test_runner_common_srcs ${onnx_test_runner_src_dir}/main.cc) + list(REMOVE_ITEM onnx_test_runner_common_srcs ${onnx_test_runner_src_dir}/main.cc) -onnxruntime_add_static_library(onnx_test_runner_common ${onnx_test_runner_common_srcs}) -if(MSVC) - target_compile_options(onnx_test_runner_common PRIVATE "$<$:SHELL:--compiler-options /utf-8>" - "$<$>:/utf-8>") -else() - target_compile_definitions(onnx_test_runner_common PUBLIC -DNSYNC_ATOMIC_CPP11) - target_include_directories(onnx_test_runner_common PRIVATE ${CMAKE_CURRENT_BINARY_DIR} ${ONNXRUNTIME_ROOT}) - onnxruntime_add_include_to_target(onnx_test_runner_common nsync::nsync_cpp) -endif() -if (MSVC AND NOT CMAKE_SIZEOF_VOID_P EQUAL 8) - #TODO: fix the warnings, they are dangerous - target_compile_options(onnx_test_runner_common PRIVATE "/wd4244") -endif() -onnxruntime_add_include_to_target(onnx_test_runner_common onnxruntime_common onnxruntime_framework - onnxruntime_test_utils onnx onnx_proto re2::re2 flatbuffers::flatbuffers Boost::mp11 safeint_interface) + onnxruntime_add_static_library(onnx_test_runner_common ${onnx_test_runner_common_srcs}) + if(MSVC) + target_compile_options(onnx_test_runner_common PRIVATE "$<$:SHELL:--compiler-options /utf-8>" + "$<$>:/utf-8>") + else() + target_compile_definitions(onnx_test_runner_common PUBLIC -DNSYNC_ATOMIC_CPP11) + target_include_directories(onnx_test_runner_common PRIVATE ${CMAKE_CURRENT_BINARY_DIR} ${ONNXRUNTIME_ROOT}) + onnxruntime_add_include_to_target(onnx_test_runner_common nsync::nsync_cpp) + endif() + if (MSVC AND NOT CMAKE_SIZEOF_VOID_P EQUAL 8) + #TODO: fix the warnings, they are dangerous + target_compile_options(onnx_test_runner_common PRIVATE "/wd4244") + endif() + onnxruntime_add_include_to_target(onnx_test_runner_common onnxruntime_common onnxruntime_framework + onnxruntime_test_utils onnx onnx_proto re2::re2 flatbuffers::flatbuffers Boost::mp11 safeint_interface) -add_dependencies(onnx_test_runner_common onnx_test_data_proto ${onnxruntime_EXTERNAL_DEPENDENCIES}) -target_include_directories(onnx_test_runner_common PRIVATE ${eigen_INCLUDE_DIRS} - ${CMAKE_CURRENT_BINARY_DIR} ${ONNXRUNTIME_ROOT}) + add_dependencies(onnx_test_runner_common onnx_test_data_proto ${onnxruntime_EXTERNAL_DEPENDENCIES}) + target_include_directories(onnx_test_runner_common PRIVATE ${eigen_INCLUDE_DIRS} + ${CMAKE_CURRENT_BINARY_DIR} ${ONNXRUNTIME_ROOT}) -set_target_properties(onnx_test_runner_common PROPERTIES FOLDER "ONNXRuntimeTest") + set_target_properties(onnx_test_runner_common PROPERTIES FOLDER "ONNXRuntimeTest") + set(onnx_test_runner_common_lib onnx_test_runner_common) +endif() set(all_tests ${onnxruntime_test_common_src} ${onnxruntime_test_ir_src} ${onnxruntime_test_optimizer_src} ${onnxruntime_test_framework_src} ${onnxruntime_test_providers_src} ${onnxruntime_test_quantiztion_src}) @@ -783,7 +777,15 @@ if (onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS) onnxruntime_add_shared_library_module(onnxruntime_providers_cuda_ut ${onnxruntime_test_providers_cuda_ut_src} $) config_cuda_provider_shared_module(onnxruntime_providers_cuda_ut) onnxruntime_add_include_to_target(onnxruntime_providers_cuda_ut GTest::gtest GTest::gmock) + target_include_directories(onnxruntime_providers_cuda_ut PRIVATE ${ONNXRUNTIME_ROOT}/core/mickey) target_link_libraries(onnxruntime_providers_cuda_ut PRIVATE GTest::gtest GTest::gmock ${ONNXRUNTIME_MLAS_LIBS} onnxruntime_common) + if (MSVC) + # Cutlass code has an issue with the following: + # warning C4100: 'magic': unreferenced formal parameter + 
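The target_compile_options call below uses the standard nvcc pass-through idiom: one generator expression forwards the MSVC flag via --compiler-options when nvcc compiles CUDA sources, and a second applies the flag directly to non-CUDA sources. Written out in full, with demo_cuda as a hypothetical target, the pattern is:

    # forward /wd4100 to the host compiler for CUDA sources compiled by nvcc,
    # and pass it directly when MSVC compiles C++ sources
    target_compile_options(demo_cuda PRIVATE
      "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:--compiler-options /wd4100>"
      "$<$<NOT:$<COMPILE_LANGUAGE:CUDA>>:/wd4100>")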
target_compile_options(onnxruntime_providers_cuda_ut PRIVATE "$<$:SHELL:--compiler-options /wd4100>" + "$<$>:/wd4100>") + endif() + list(APPEND onnxruntime_test_providers_dependencies onnxruntime_providers_cuda_ut) endif() @@ -824,6 +826,17 @@ if (CMAKE_SYSTEM_NAME STREQUAL "Emscripten") "${TEST_SRC_DIR}/providers/memcpy_test.cc" ) endif() + list(REMOVE_ITEM all_tests "${TEST_SRC_DIR}/providers/cpu/reduction/reduction_ops_test.cc" + "${TEST_SRC_DIR}/providers/cpu/tensor/grid_sample_test.cc") +endif() + +if (CMAKE_SYSTEM_NAME STREQUAL "Emscripten" OR IOS) + # Because we do not run these model tests in our web or iOS CI build pipelines, and some test code uses C++17 + # filesystem functions that are not available in the iOS version we target. + message("Disable model tests in onnxruntime_test_all") + list(REMOVE_ITEM all_tests + "${TEST_SRC_DIR}/providers/cpu/model_tests.cc" + ) endif() set(test_all_args) @@ -843,7 +856,7 @@ AddTest( TARGET onnxruntime_test_all SOURCES ${all_tests} ${onnxruntime_unittest_main_src} LIBS - onnx_test_runner_common ${onnxruntime_test_providers_libs} ${onnxruntime_test_common_libs} + ${onnx_test_runner_common_lib} ${onnxruntime_test_providers_libs} ${onnxruntime_test_common_libs} onnx_test_data_proto DEPENDS ${all_dependencies} TEST_ARGS ${test_all_args} @@ -881,7 +894,7 @@ endif() # the default logger tests conflict with the need to have an overall default logger # so skip in this type of target_compile_definitions(onnxruntime_test_all PUBLIC -DSKIP_DEFAULT_LOGGER_TESTS) -if (CMAKE_SYSTEM_NAME STREQUAL "iOS") +if (IOS) target_compile_definitions(onnxruntime_test_all_xc PUBLIC -DSKIP_DEFAULT_LOGGER_TESTS) endif() if(onnxruntime_RUN_MODELTEST_IN_DEBUG_MODE) @@ -906,7 +919,7 @@ if (onnxruntime_ENABLE_TRAINING_TORCH_INTEROP) endif() if (CMAKE_SYSTEM_NAME STREQUAL "Emscripten") set_target_properties(onnxruntime_test_all PROPERTIES LINK_DEPENDS ${TEST_SRC_DIR}/wasm/onnxruntime_test_all_adapter.js) - set_target_properties(onnxruntime_test_all PROPERTIES LINK_FLAGS "-s STACK_SIZE=5242880 -s ALLOW_MEMORY_GROWTH=1 -s MAXIMUM_MEMORY=4294967296 --pre-js \"${TEST_SRC_DIR}/wasm/onnxruntime_test_all_adapter.js\" -s \"EXPORTED_RUNTIME_METHODS=['FS']\" --preload-file ${CMAKE_CURRENT_BINARY_DIR}/testdata@/testdata -s EXIT_RUNTIME=1 -s DEMANGLE_SUPPORT=1") + set_target_properties(onnxruntime_test_all PROPERTIES LINK_FLAGS "-s STACK_SIZE=5242880 -s INITIAL_MEMORY=536870912 -s ALLOW_MEMORY_GROWTH=1 -s MAXIMUM_MEMORY=4294967296 -s INCOMING_MODULE_JS_API=[preRun,locateFile,arguments,onExit,wasmMemory,buffer,instantiateWasm] --pre-js \"${TEST_SRC_DIR}/wasm/onnxruntime_test_all_adapter.js\" -s \"EXPORTED_RUNTIME_METHODS=['FS']\" --preload-file ${CMAKE_CURRENT_BINARY_DIR}/testdata@/testdata -s EXIT_RUNTIME=1 -s DEMANGLE_SUPPORT=1") if (onnxruntime_ENABLE_WEBASSEMBLY_THREADS) set_property(TARGET onnxruntime_test_all APPEND_STRING PROPERTY LINK_FLAGS " -s DEFAULT_PTHREAD_STACK_SIZE=131072 -s PROXY_TO_PTHREAD=1") endif() @@ -969,39 +982,11 @@ if (NOT onnxruntime_ENABLE_TRAINING_TORCH_INTEROP) endif() if (onnxruntime_USE_QNN) - if (NOT QNN_ARCH_ABI) - string(TOLOWER ${onnxruntime_target_platform} GEN_PLATFORM) - if(MSVC) - message(STATUS "Building MSVC for architecture ${CMAKE_SYSTEM_PROCESSOR} with CMAKE_GENERATOR_PLATFORM as ${GEN_PLATFORM}") - if (${GEN_PLATFORM} STREQUAL "arm64") - set(QNN_ARCH_ABI aarch64-windows-msvc) - else() - set(QNN_ARCH_ABI x86_64-windows-msvc) - endif() - else() - if (${CMAKE_SYSTEM_NAME} STREQUAL "Android") - set(QNN_ARCH_ABI aarch64-android-clang6.0) - elseif 
(${CMAKE_SYSTEM_NAME} STREQUAL "Linux") - if (${GEN_PLATFORM} STREQUAL "x86_64") - set(QNN_ARCH_ABI x86_64-linux-clang) - else() - set(QNN_ARCH_ABI aarch64-android) - endif() - endif() - endif() - endif() - if (MSVC OR ${CMAKE_SYSTEM_NAME} STREQUAL "Linux") - file(GLOB QNN_LIB_FILES LIST_DIRECTORIES false "${onnxruntime_QNN_HOME}/lib/${QNN_ARCH_ABI}/*.so" "${onnxruntime_QNN_HOME}/lib/${QNN_ARCH_ABI}/*.dll") - if (${QNN_ARCH_ABI} STREQUAL "aarch64-windows-msvc") - file(GLOB EXTRA_HTP_LIB LIST_DIRECTORIES false "${onnxruntime_QNN_HOME}/lib/hexagon-v68/unsigned/libQnnHtpV68Skel.so" "${onnxruntime_QNN_HOME}/lib/hexagon-v73/unsigned/libQnnHtpV73Skel.so") - list(APPEND QNN_LIB_FILES ${EXTRA_HTP_LIB}) - endif() - message(STATUS "QNN lib files: " ${QNN_LIB_FILES}) - add_custom_command( - TARGET ${test_data_target} POST_BUILD - COMMAND ${CMAKE_COMMAND} -E copy ${QNN_LIB_FILES} $<TARGET_FILE_DIR:${test_data_target}> - ) + add_custom_command( + TARGET ${test_data_target} POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy ${QNN_LIB_FILES} $<TARGET_FILE_DIR:${test_data_target}> + ) endif() endif() @@ -1052,45 +1037,42 @@ if (onnxruntime_ENABLE_LANGUAGE_INTEROP_OPS) list(APPEND onnx_test_libs onnxruntime_language_interop onnxruntime_pyop) endif() -onnxruntime_add_executable(onnx_test_runner ${onnx_test_runner_src_dir}/main.cc) -if(MSVC) - target_compile_options(onnx_test_runner PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:--compiler-options /utf-8>" - "$<$<NOT:$<COMPILE_LANGUAGE:CUDA>>:/utf-8>") -endif() -if(${CMAKE_SYSTEM_NAME} STREQUAL "iOS") - set_target_properties(onnx_test_runner PROPERTIES - XCODE_ATTRIBUTE_CODE_SIGNING_ALLOWED "NO" - ) -endif() -if (CMAKE_SYSTEM_NAME STREQUAL "Emscripten") - if (onnxruntime_ENABLE_WEBASSEMBLY_THREADS) - set_target_properties(onnx_test_runner PROPERTIES LINK_FLAGS "-s NODERAWFS=1 -s ALLOW_MEMORY_GROWTH=1 -s PROXY_TO_PTHREAD=1 -s EXIT_RUNTIME=1") - else() - set_target_properties(onnx_test_runner PROPERTIES LINK_FLAGS "-s NODERAWFS=1 -s ALLOW_MEMORY_GROWTH=1") - endif() -endif() +if (NOT IOS) + onnxruntime_add_executable(onnx_test_runner ${onnx_test_runner_src_dir}/main.cc) + if(MSVC) + target_compile_options(onnx_test_runner PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:--compiler-options /utf-8>" + "$<$<NOT:$<COMPILE_LANGUAGE:CUDA>>:/utf-8>") + endif() + if (CMAKE_SYSTEM_NAME STREQUAL "Emscripten") + if (onnxruntime_ENABLE_WEBASSEMBLY_THREADS) + set_target_properties(onnx_test_runner PROPERTIES LINK_FLAGS "-s NODERAWFS=1 -s ALLOW_MEMORY_GROWTH=1 -s PROXY_TO_PTHREAD=1 -s EXIT_RUNTIME=1") + else() + set_target_properties(onnx_test_runner PROPERTIES LINK_FLAGS "-s NODERAWFS=1 -s ALLOW_MEMORY_GROWTH=1") + endif() + endif() -target_link_libraries(onnx_test_runner PRIVATE onnx_test_runner_common ${GETOPT_LIB_WIDE} ${onnx_test_libs} nlohmann_json::nlohmann_json) -target_include_directories(onnx_test_runner PRIVATE ${ONNXRUNTIME_ROOT}) -if (onnxruntime_USE_ROCM) - target_include_directories(onnx_test_runner PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/amdgpu/onnxruntime ${CMAKE_CURRENT_BINARY_DIR}/amdgpu/orttraining) -endif() -if (onnxruntime_ENABLE_TRAINING_TORCH_INTEROP) - target_link_libraries(onnx_test_runner PRIVATE Python::Python) -endif() -set_target_properties(onnx_test_runner PROPERTIES FOLDER "ONNXRuntimeTest") + target_link_libraries(onnx_test_runner PRIVATE onnx_test_runner_common ${GETOPT_LIB_WIDE} ${onnx_test_libs} nlohmann_json::nlohmann_json) + target_include_directories(onnx_test_runner PRIVATE ${ONNXRUNTIME_ROOT}) + if (onnxruntime_USE_ROCM) + target_include_directories(onnx_test_runner PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/amdgpu/onnxruntime ${CMAKE_CURRENT_BINARY_DIR}/amdgpu/orttraining) + endif() + if (onnxruntime_ENABLE_TRAINING_TORCH_INTEROP) + 
target_link_libraries(onnx_test_runner PRIVATE Python::Python) + endif() + set_target_properties(onnx_test_runner PROPERTIES FOLDER "ONNXRuntimeTest") -if (onnxruntime_USE_TVM) - if (WIN32) - target_link_options(onnx_test_runner PRIVATE "/STACK:4000000") - endif() -endif() + if (onnxruntime_USE_TVM) + if (WIN32) + target_link_options(onnx_test_runner PRIVATE "/STACK:4000000") + endif() + endif() -install(TARGETS onnx_test_runner - ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} - LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} - BUNDLE DESTINATION ${CMAKE_INSTALL_LIBDIR} - RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}) + install(TARGETS onnx_test_runner + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} + BUNDLE DESTINATION ${CMAKE_INSTALL_LIBDIR} + RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}) +endif() if (NOT onnxruntime_ENABLE_TRAINING_TORCH_INTEROP) if(onnxruntime_BUILD_BENCHMARKS) @@ -1171,90 +1153,80 @@ endif() if (NOT onnxruntime_ENABLE_TRAINING_TORCH_INTEROP) - #perf test runner - set(onnxruntime_perf_test_src_dir ${TEST_SRC_DIR}/perftest) - set(onnxruntime_perf_test_src_patterns - "${onnxruntime_perf_test_src_dir}/*.cc" - "${onnxruntime_perf_test_src_dir}/*.h") + if(NOT IOS) + #perf test runner + set(onnxruntime_perf_test_src_dir ${TEST_SRC_DIR}/perftest) + set(onnxruntime_perf_test_src_patterns + "${onnxruntime_perf_test_src_dir}/*.cc" + "${onnxruntime_perf_test_src_dir}/*.h") - if(WIN32) - list(APPEND onnxruntime_perf_test_src_patterns - "${onnxruntime_perf_test_src_dir}/windows/*.cc" - "${onnxruntime_perf_test_src_dir}/windows/*.h" ) - else () - list(APPEND onnxruntime_perf_test_src_patterns - "${onnxruntime_perf_test_src_dir}/posix/*.cc" - "${onnxruntime_perf_test_src_dir}/posix/*.h" ) - endif() + if(WIN32) + list(APPEND onnxruntime_perf_test_src_patterns + "${onnxruntime_perf_test_src_dir}/windows/*.cc" + "${onnxruntime_perf_test_src_dir}/windows/*.h" ) + else () + list(APPEND onnxruntime_perf_test_src_patterns + "${onnxruntime_perf_test_src_dir}/posix/*.cc" + "${onnxruntime_perf_test_src_dir}/posix/*.h" ) + endif() - file(GLOB onnxruntime_perf_test_src CONFIGURE_DEPENDS - ${onnxruntime_perf_test_src_patterns} - ) - onnxruntime_add_executable(onnxruntime_perf_test ${onnxruntime_perf_test_src} ${ONNXRUNTIME_ROOT}/core/platform/path_lib.cc) - if(MSVC) - target_compile_options(onnxruntime_perf_test PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:--compiler-options /utf-8>" + file(GLOB onnxruntime_perf_test_src CONFIGURE_DEPENDS + ${onnxruntime_perf_test_src_patterns} + ) + onnxruntime_add_executable(onnxruntime_perf_test ${onnxruntime_perf_test_src} ${ONNXRUNTIME_ROOT}/core/platform/path_lib.cc) + if(MSVC) + target_compile_options(onnxruntime_perf_test PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:--compiler-options /utf-8>" "$<$<NOT:$<COMPILE_LANGUAGE:CUDA>>:/utf-8>") - endif() - target_include_directories(onnxruntime_perf_test PRIVATE ${onnx_test_runner_src_dir} ${ONNXRUNTIME_ROOT} + endif() + target_include_directories(onnxruntime_perf_test PRIVATE ${onnx_test_runner_src_dir} ${ONNXRUNTIME_ROOT} ${eigen_INCLUDE_DIRS} ${onnxruntime_graph_header} ${onnxruntime_exec_src_dir} ${CMAKE_CURRENT_BINARY_DIR}) - if (onnxruntime_USE_ROCM) - target_include_directories(onnxruntime_perf_test PRIVATE 
${CMAKE_CURRENT_BINARY_DIR}/amdgpu/onnxruntime ${CMAKE_CURRENT_BINARY_DIR}/amdgpu/orttraining) + endif() + if (WIN32) + target_compile_options(onnxruntime_perf_test PRIVATE ${disabled_warnings}) + if (NOT DEFINED SYS_PATH_LIB) + set(SYS_PATH_LIB shlwapi) + endif() endif() - endif() - if(${CMAKE_SYSTEM_NAME} STREQUAL "iOS") - set_target_properties(onnxruntime_perf_test PROPERTIES - XCODE_ATTRIBUTE_CODE_SIGNING_ALLOWED "NO" - ) - endif() - if (onnxruntime_BUILD_SHARED_LIB) - #It will dynamically link to onnxruntime. So please don't add onxruntime_graph/onxruntime_framework/... here. - #onnxruntime_common is kind of ok because it is thin, tiny and totally stateless. - set(onnxruntime_perf_test_libs + if (onnxruntime_BUILD_SHARED_LIB) + #It will dynamically link to onnxruntime. So please don't add onxruntime_graph/onxruntime_framework/... here. + #onnxruntime_common is kind of ok because it is thin, tiny and totally stateless. + set(onnxruntime_perf_test_libs onnx_test_runner_common onnxruntime_test_utils onnxruntime_common onnxruntime onnxruntime_flatbuffers onnx_test_data_proto ${onnxruntime_EXTERNAL_LIBRARIES} ${GETOPT_LIB_WIDE} ${SYS_PATH_LIB} ${CMAKE_DL_LIBS}) - if(NOT WIN32) - list(APPEND onnxruntime_perf_test_libs nsync::nsync_cpp) - if(onnxruntime_USE_SNPE) - list(APPEND onnxruntime_perf_test_libs onnxruntime_providers_snpe) + if(NOT WIN32) + list(APPEND onnxruntime_perf_test_libs nsync::nsync_cpp) + if(onnxruntime_USE_SNPE) + list(APPEND onnxruntime_perf_test_libs onnxruntime_providers_snpe) + endif() endif() + if (CMAKE_SYSTEM_NAME STREQUAL "Android") + list(APPEND onnxruntime_perf_test_libs ${android_shared_libs}) + endif() + target_link_libraries(onnxruntime_perf_test PRIVATE ${onnxruntime_perf_test_libs} Threads::Threads) + if(WIN32) + target_link_libraries(onnxruntime_perf_test PRIVATE debug dbghelp advapi32) + endif() + else() + target_link_libraries(onnxruntime_perf_test PRIVATE onnx_test_runner_common ${GETOPT_LIB_WIDE} ${onnx_test_libs}) endif() - if (CMAKE_SYSTEM_NAME STREQUAL "Android") - list(APPEND onnxruntime_perf_test_libs ${android_shared_libs}) - endif() - target_link_libraries(onnxruntime_perf_test PRIVATE ${onnxruntime_perf_test_libs} Threads::Threads) - if(WIN32) - target_link_libraries(onnxruntime_perf_test PRIVATE debug dbghelp advapi32) - endif() - if(tensorflow_C_PACKAGE_PATH) - target_include_directories(onnxruntime_perf_test PRIVATE ${tensorflow_C_PACKAGE_PATH}/include) - target_link_directories(onnxruntime_perf_test PRIVATE ${tensorflow_C_PACKAGE_PATH}/lib) - target_link_libraries(onnxruntime_perf_test PRIVATE tensorflow) - target_compile_definitions(onnxruntime_perf_test PRIVATE HAVE_TENSORFLOW) - endif() - else() - target_link_libraries(onnxruntime_perf_test PRIVATE onnx_test_runner_common ${GETOPT_LIB_WIDE} ${onnx_test_libs}) - endif() - set_target_properties(onnxruntime_perf_test PROPERTIES FOLDER "ONNXRuntimeTest") + set_target_properties(onnxruntime_perf_test PROPERTIES FOLDER "ONNXRuntimeTest") - if (onnxruntime_ENABLE_LANGUAGE_INTEROP_OPS AND NOT onnxruntime_BUILD_SHARED_LIB) - target_link_libraries(onnxruntime_perf_test PRIVATE onnxruntime_language_interop onnxruntime_pyop) - endif() + if (onnxruntime_ENABLE_LANGUAGE_INTEROP_OPS AND NOT onnxruntime_BUILD_SHARED_LIB) + target_link_libraries(onnxruntime_perf_test PRIVATE onnxruntime_language_interop onnxruntime_pyop) + endif() - if (onnxruntime_USE_TVM) - if (WIN32) - target_link_options(onnxruntime_perf_test PRIVATE "/STACK:4000000") + if (onnxruntime_USE_TVM) + if (WIN32) + 
target_link_options(onnxruntime_perf_test PRIVATE "/STACK:4000000") + endif() + endif() endif() - # shared lib if (onnxruntime_BUILD_SHARED_LIB) onnxruntime_add_static_library(onnxruntime_mocked_allocator ${TEST_SRC_DIR}/util/test_allocator.cc) @@ -1275,7 +1247,10 @@ if (NOT onnxruntime_ENABLE_TRAINING_TORCH_INTEROP) list(APPEND onnxruntime_shared_lib_test_LIBS cpuinfo) endif() if (onnxruntime_USE_CUDA) - list(APPEND onnxruntime_shared_lib_test_LIBS cudart) + list(APPEND onnxruntime_shared_lib_test_LIBS CUDA::cudart) + endif() + if (onnxruntime_USE_ROCM) + list(APPEND onnxruntime_shared_lib_test_LIBS hip::host) endif() if (onnxruntime_USE_TENSORRT) list(APPEND onnxruntime_shared_lib_test_LIBS ${TENSORRT_LIBRARY_INFER}) @@ -1294,6 +1269,10 @@ if (NOT onnxruntime_ENABLE_TRAINING_TORCH_INTEROP) target_include_directories(onnxruntime_shared_lib_test PRIVATE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) target_sources(onnxruntime_shared_lib_test PRIVATE ${ONNXRUNTIME_SHARED_LIB_TEST_SRC_DIR}/cuda_ops.cu) endif() + if (onnxruntime_USE_ROCM) + target_include_directories(onnxruntime_shared_lib_test PRIVATE ${onnxruntime_ROCM_HOME}/include) + target_compile_definitions(onnxruntime_shared_lib_test PRIVATE __HIP_PLATFORM_AMD__) + endif() if (CMAKE_SYSTEM_NAME STREQUAL "Android") target_sources(onnxruntime_shared_lib_test PRIVATE "${ONNXRUNTIME_ROOT}/core/platform/android/cxa_demangle.cc" @@ -1302,7 +1281,7 @@ if (NOT onnxruntime_ENABLE_TRAINING_TORCH_INTEROP) target_compile_definitions(onnxruntime_shared_lib_test PRIVATE USE_DUMMY_EXA_DEMANGLE=1) endif() - if (CMAKE_SYSTEM_NAME STREQUAL "iOS") + if (IOS) add_custom_command( TARGET onnxruntime_shared_lib_test POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy_directory @@ -1389,7 +1368,7 @@ if (NOT onnxruntime_ENABLE_TRAINING_TORCH_INTEROP) target_compile_options(onnxruntime_mlas_test PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:--compiler-options /wd26426>" "$<$<NOT:$<COMPILE_LANGUAGE:CUDA>>:/wd26426>") endif() - if(${CMAKE_SYSTEM_NAME} STREQUAL "iOS") + if(IOS) set_target_properties(onnxruntime_mlas_test PROPERTIES XCODE_ATTRIBUTE_CODE_SIGNING_ALLOWED "NO" ) @@ -1590,7 +1569,7 @@ if (NOT CMAKE_SYSTEM_NAME STREQUAL "Emscripten") DEPENDS ${all_dependencies} ) - if (CMAKE_SYSTEM_NAME STREQUAL "iOS") + if (IOS) add_custom_command( TARGET onnxruntime_customopregistration_test POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy_directory @@ -1662,6 +1641,38 @@ if (NOT CMAKE_SYSTEM_NAME STREQUAL "Emscripten" AND (NOT onnxruntime_MINIMAL_BUI ${ONNXRUNTIME_CUSTOM_OP_GET_CONST_INPUT_TEST_LIB_LINK_FLAG}) endif() +if (NOT CMAKE_SYSTEM_NAME STREQUAL "Emscripten" AND (NOT onnxruntime_MINIMAL_BUILD OR onnxruntime_MINIMAL_BUILD_CUSTOM_OPS)) + + file(GLOB_RECURSE custom_op_local_function_test_library_src + "${TEST_SRC_DIR}/testdata/custom_op_local_function/custom_op_local_function.cc" + "${TEST_SRC_DIR}/testdata/custom_op_local_function/custom_op_local_function.h" + "${TEST_SRC_DIR}/testdata/custom_op_local_function/dummy_gemm.cc" + "${TEST_SRC_DIR}/testdata/custom_op_local_function/dummy_gemm.h" + ) + + onnxruntime_add_shared_library_module(custom_op_local_function ${custom_op_local_function_test_library_src}) + + onnxruntime_add_include_to_target(custom_op_local_function onnxruntime_common GTest::gtest GTest::gmock) + target_include_directories(custom_op_local_function PRIVATE ${REPO_ROOT}/include/onnxruntime/core/session + ${REPO_ROOT}/include/onnxruntime/core/common) + + if(UNIX) + if (APPLE) + set(ONNXRUNTIME_CUSTOM_OP_lOCAL_FUNCTION_TEST_LIB_LINK_FLAG "-Xlinker -dead_strip") + else() + string(CONCAT 
ONNXRUNTIME_CUSTOM_OP_lOCAL_FUNCTION_TEST_LIB_LINK_FLAG + "-Xlinker --version-script=${TEST_SRC_DIR}/testdata/custom_op_local_function/custom_op_local_function.lds " + "-Xlinker --no-undefined -Xlinker --gc-sections -z noexecstack") + endif() + else() + set(ONNXRUNTIME_CUSTOM_OP_lOCAL_FUNCTION_TEST_LIB_LINK_FLAG + "-DEF:${TEST_SRC_DIR}/testdata/custom_op_local_function/custom_op_local_function.def") + endif() + + set_property(TARGET custom_op_local_function APPEND_STRING PROPERTY LINK_FLAGS + ${ONNXRUNTIME_CUSTOM_OP_lOCAL_FUNCTION_TEST_LIB_LINK_FLAG}) +endif() + if (onnxruntime_BUILD_SHARED_LIB AND NOT CMAKE_SYSTEM_NAME STREQUAL "Emscripten" AND NOT onnxruntime_MINIMAL_BUILD) set (onnxruntime_logging_apis_test_SRC ${ONNXRUNTIME_LOGGING_APIS_TEST_SRC_DIR}/test_logging_apis.cc) diff --git a/cmake/onnxruntime_webassembly.cmake b/cmake/onnxruntime_webassembly.cmake index 9014089cb611..546d50c1ca2d 100644 --- a/cmake/onnxruntime_webassembly.cmake +++ b/cmake/onnxruntime_webassembly.cmake @@ -225,6 +225,7 @@ else() "SHELL:-s EXPORT_ALL=0" "SHELL:-s VERBOSE=0" "SHELL:-s FILESYSTEM=0" + "SHELL:-s INCOMING_MODULE_JS_API=[preRun,locateFile,arguments,onExit,wasmMemory,buffer,instantiateWasm,mainScriptUrlOrBlob]" ${WASM_API_EXCEPTION_CATCHING} --no-entry ) @@ -267,7 +268,10 @@ else() endif() if (onnxruntime_USE_WEBNN) - set_property(TARGET onnxruntime_webassembly APPEND_STRING PROPERTY LINK_FLAGS " --bind -sWASM_BIGINT") + set_property(TARGET onnxruntime_webassembly APPEND_STRING PROPERTY LINK_FLAGS " --bind -sWASM_BIGINT") + if (onnxruntime_DISABLE_RTTI) + set_property(TARGET onnxruntime_webassembly APPEND_STRING PROPERTY LINK_FLAGS " -fno-rtti -DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0") + endif() endif() # Set link flag to enable exceptions support, this will override default disabling exception throwing behavior when disable exceptions. 
@@ -281,6 +285,7 @@ else() target_link_options(onnxruntime_webassembly PRIVATE "SHELL:-s EXPORT_NAME=ortWasmThreaded" "SHELL:-s DEFAULT_PTHREAD_STACK_SIZE=131072" + "SHELL:-s PTHREAD_POOL_SIZE=Module[\\\"numThreads\\\"]" ) else() target_link_options(onnxruntime_webassembly PRIVATE diff --git a/cmake/patches/abseil/absl_windows.patch b/cmake/patches/abseil/absl_windows.patch index 66ef0c5125a7..584c49d61229 100644 --- a/cmake/patches/abseil/absl_windows.patch +++ b/cmake/patches/abseil/absl_windows.patch @@ -25,17 +25,91 @@ index a6efc98e..8c4de8e7 100644 "/wd4800", ] diff --git a/absl/copts/copts.py b/absl/copts/copts.py -index 0d6c1ec3..75fd935f 100644 +index e6e11949..0aa7d868 100644 --- a/absl/copts/copts.py +++ b/absl/copts/copts.py -@@ -132,10 +132,6 @@ COPT_VARS = { - "/wd4068", # unknown pragma - # qualifier applied to function type has no meaning; ignored - "/wd4180", -- # conversion from 'type1' to 'type2', possible loss of data -- "/wd4244", -- # conversion from 'size_t' to 'type', possible loss of data -- "/wd4267", - # The decorated name was longer than the compiler limit - "/wd4503", - # forcing value to bool 'true' or 'false' (performance warning) +@@ -115,10 +115,6 @@ MSVC_WARNING_FLAGS = [ + "/wd4068", # unknown pragma + # qualifier applied to function type has no meaning; ignored + "/wd4180", +- # conversion from 'type1' to 'type2', possible loss of data +- "/wd4244", +- # conversion from 'size_t' to 'type', possible loss of data +- "/wd4267", + # The decorated name was longer than the compiler limit + "/wd4503", + # forcing value to bool 'true' or 'false' (performance warning) +diff --git a/absl/debugging/symbolize_win32.inc b/absl/debugging/symbolize_win32.inc +index 53a099a1..34d210d6 100644 +--- a/absl/debugging/symbolize_win32.inc ++++ b/absl/debugging/symbolize_win32.inc +@@ -35,15 +35,15 @@ ABSL_NAMESPACE_BEGIN + + static HANDLE process = NULL; + +-void InitializeSymbolizer(const char*) { +- if (process != nullptr) { +- return; +- } ++namespace { ++void InitializeSymbolizerImpl() { ++ + process = GetCurrentProcess(); + + // Symbols are not loaded until a reference is made requiring the + // symbols be loaded. This is the fastest, most efficient way to use + // the symbol handler. ++ + SymSetOptions(SYMOPT_DEFERRED_LOADS | SYMOPT_UNDNAME); + if (!SymInitialize(process, nullptr, true)) { + // GetLastError() returns a Win32 DWORD, but we assign to +@@ -54,6 +54,36 @@ void InitializeSymbolizer(const char*) { + } + } + ++bool LookupAndInitialize(const void* pc, SYMBOL_INFO* symbol) { ++ auto hProcess = (process != NULL) ? 
process : GetCurrentProcess(); ++ if (SymFromAddr(hProcess, reinterpret_cast<DWORD64>(pc), nullptr, symbol) != TRUE) { ++ if (GetLastError() == ERROR_INVALID_HANDLE && process == NULL) { ++ InitializeSymbolizerImpl(); ++ if (SymFromAddr(process, reinterpret_cast<DWORD64>(pc), nullptr, symbol) != TRUE) { ++ return false; ++ } ++ } else { ++ return false; ++ } ++ return false; ++ } ++ return true; ++} ++} ++ ++void InitializeSymbolizer(const char*) { ++ if (process != nullptr) { ++ return; ++ } ++ ++ alignas(SYMBOL_INFO) char buf[sizeof(SYMBOL_INFO) + MAX_SYM_NAME]; ++ SYMBOL_INFO* symbol = reinterpret_cast<SYMBOL_INFO*>(buf); ++ symbol->SizeOfStruct = sizeof(SYMBOL_INFO); ++ symbol->MaxNameLen = MAX_SYM_NAME; ++ ++ static_cast<void>(LookupAndInitialize(reinterpret_cast<const void*>(&InitializeSymbolizer), symbol)); ++} ++ + bool Symbolize(const void* pc, char* out, int out_size) { + if (out_size <= 0) { + return false; +@@ -62,9 +92,11 @@ bool Symbolize(const void* pc, char* out, int out_size) { + SYMBOL_INFO* symbol = reinterpret_cast<SYMBOL_INFO*>(buf); + symbol->SizeOfStruct = sizeof(SYMBOL_INFO); + symbol->MaxNameLen = MAX_SYM_NAME; +- if (!SymFromAddr(process, reinterpret_cast<DWORD64>(pc), nullptr, symbol)) { ++ ++ if(!LookupAndInitialize(pc, symbol)) { + return false; + } ++ + const size_t out_size_t = static_cast<size_t>(out_size); + strncpy(out, symbol->Name, out_size_t); + if (out[out_size_t - 1] != '\0') { diff --git a/cmake/patches/coremltools/crossplatformbuild.patch b/cmake/patches/coremltools/crossplatformbuild.patch new file mode 100644 index 000000000000..7f2268f50c82 --- /dev/null +++ b/cmake/patches/coremltools/crossplatformbuild.patch @@ -0,0 +1,155 @@ +diff --git a/mlmodel/src/MILBlob/Blob/FileWriter.cpp b/mlmodel/src/MILBlob/Blob/FileWriter.cpp +index adc7bfcf..7b2bf9cc 100644 +--- a/mlmodel/src/MILBlob/Blob/FileWriter.cpp ++++ b/mlmodel/src/MILBlob/Blob/FileWriter.cpp +@@ -8,8 +8,12 @@ + + #include + #include ++ ++// ORT_EDIT: Exclude mmap on Windows. Not used in this file anyway. ++#if !defined(_WIN32) + #include <sys/mman.h> + #include ++#endif + + using namespace MILBlob; + using namespace MILBlob::Blob; +diff --git a/mlmodel/src/MILBlob/Fp16.cpp b/mlmodel/src/MILBlob/Fp16.cpp +index ae1e71a1..77a7161f 100644 +--- a/mlmodel/src/MILBlob/Fp16.cpp ++++ b/mlmodel/src/MILBlob/Fp16.cpp +@@ -5,6 +5,8 @@ + + #include "MILBlob/Fp16.hpp" + ++// ORT_EDIT: Exclude clang specific pragmas from other builds ++#if defined(__clang__) + // fp16 lib code has some conversion warnings we don't want to globally ignore + #pragma clang diagnostic push + #pragma clang diagnostic ignored "-Wincompatible-pointer-types" +@@ -12,6 +14,9 @@ + #pragma clang diagnostic ignored "-Wconversion" + #include "fp16/fp16.h" + #pragma clang diagnostic pop ++#else ++#include "fp16/fp16.h" ++#endif + + using namespace MILBlob; + +diff --git a/modelpackage/src/ModelPackage.cpp b/modelpackage/src/ModelPackage.cpp +index 8fee56b9..99e0d8d6 100644 +--- a/modelpackage/src/ModelPackage.cpp ++++ b/modelpackage/src/ModelPackage.cpp +@@ -26,7 +26,14 @@ namespace std { + #else + #error "missing required header <filesystem>" + #endif ++ ++// ORT_EDIT: Use UuidCreate on Windows. 
++#if defined(_WIN32) ++#pragma comment(lib, "rpcrt4.lib") // UuidCreate ++#include <Rpc.h> ++#else + #include <uuid/uuid.h> ++#endif + #include + + #if defined(__cplusplus) +@@ -187,7 +194,10 @@ public: + ModelPackageItemInfo createFile(const std::string& name, const std::string& author, const std::string& description); + }; + ++// ORT_EDIT: pragma only available on APPLE platforms ++#if defined(__APPLE__) + #pragma mark ModelPackageImpl ++#endif + + ModelPackageImpl::ModelPackageImpl(const std::filesystem::path& path, bool createIfNecessary, bool readOnly) + : m_packagePath(path), +@@ -372,6 +382,20 @@ std::filesystem::path ModelPackageImpl::getItemPath(const std::string& name, con + } + + std::string ModelPackageImpl::generateIdentifier() const { ++// ORT_EDIT: Use built-in UUID generation on Windows ++#if defined(_WIN32) ++ UUID uuid; ++ UuidCreate(&uuid); ++ ++ RPC_CSTR uuidStr; ++ UuidToStringA(&uuid, &uuidStr); ++ ++ std::string uuidStrCpp(reinterpret_cast<char*>(uuidStr)); ++ ++ RpcStringFreeA(&uuidStr); ++ ++ return uuidStrCpp; ++#else + uuid_t uuid; + + // uuid_unparse generates a 36-character null-terminated string (37 bytes). +@@ -383,6 +407,7 @@ std::string ModelPackageImpl::generateIdentifier() const { + uuid_unparse(uuid, buf); + + return std::string(buf); ++#endif + } + + ModelPackageItemInfo ModelPackageImpl::createFile(const std::string& name, const std::string& author, const std::string& description) { +@@ -468,7 +493,13 @@ std::shared_ptr<ModelPackageItemInfo> ModelPackageImpl::findItem(const std::stri + auto author = itemInfoEntry->getString(kModelPackageItemInfoAuthorKey); + auto description = itemInfoEntry->getString(kModelPackageItemInfoDescriptionKey); + ++// ORT_EDIT: need to use path.string() on Windows ++#if defined(_WIN32) ++ return std::make_shared<ModelPackageItemInfo>(std::make_shared(identifier, path.string(), name, author, description)); ++ ++#else + return std::make_shared<ModelPackageItemInfo>(std::make_shared(identifier, path, name, author, description)); ++#endif + } + + std::shared_ptr<ModelPackageItemInfo> ModelPackageImpl::findItem(const std::string& name, const std::string& author) const +@@ -514,7 +545,9 @@ void ModelPackageImpl::removeItem(const std::string& identifier) + } + + auto path = m_packageDataDirPath / itemInfoEntry->getString(kModelPackageItemInfoPathKey); +- if (0 != std::remove(path.c_str())) { ++ // ORT_EDIT: std::remove doesn't work on Windows. Use std::filesystem::remove instead. ++ // if (0 != std::remove(path.c_str())) { ++ if (!std::filesystem::remove(path)) { + throw std::runtime_error("Failed to remove file at path: " + path.string()); + } + +@@ -525,13 +558,16 @@ bool ModelPackageImpl::isValid(const std::filesystem::path& path) + { + try { + ModelPackageImpl(path, false, true); +- } catch (std::runtime_error& e) { ++ } catch (std::runtime_error& /*e*/) { // ORT_EDIT: comment out unused variable + return false; + } + return true; + } + ++// ORT_EDIT: pragma only available on APPLE platforms ++#if defined(__APPLE__) + #pragma mark ModelPackage ++#endif + + ModelPackage::ModelPackage(const std::string& packagePath, bool createIfNecessary, bool readOnly) + : m_modelPackageImpl(std::make_shared<ModelPackageImpl>(packagePath, createIfNecessary, readOnly)) +@@ -544,7 +580,12 @@ ModelPackage::~ModelPackage() + + std::string ModelPackage::path() const + { ++// ORT_EDIT: Windows doesn't automatically convert to std::string as the native format could be char or wchar. 
++#if defined(_WIN32) ++ return m_modelPackageImpl->path().string(); ++#else + return m_modelPackageImpl->path(); ++#endif + } + + std::string ModelPackage::setRootModel(const std::string& path, const std::string& name, const std::string& author, const std::string& description) diff --git a/cmake/patches/cpuinfo/9bb12d342fd9479679d505d93a478a6f9cd50a47.patch b/cmake/patches/cpuinfo/9bb12d342fd9479679d505d93a478a6f9cd50a47.patch new file mode 100644 index 000000000000..afb19a45ce0f --- /dev/null +++ b/cmake/patches/cpuinfo/9bb12d342fd9479679d505d93a478a6f9cd50a47.patch @@ -0,0 +1,22 @@ +diff --git a/include/cpuinfo.h b/include/cpuinfo.h +index c46b65e..8b83a64 100644 +--- a/include/cpuinfo.h ++++ b/include/cpuinfo.h +@@ -18,7 +18,7 @@ + #define CPUINFO_ARCH_X86 1 + #endif + +-#if defined(__x86_64__) || defined(__x86_64) || defined(_M_X64) || defined(_M_AMD64) ++#if defined(__x86_64__) || (defined(_M_X64) && !defined(_M_ARM64EC)) || (defined(_M_AMD64) && !defined(_M_ARM64EC)) + #define CPUINFO_ARCH_X86_64 1 + #endif + +@@ -26,7 +26,7 @@ + #define CPUINFO_ARCH_ARM 1 + #endif + +-#if defined(__aarch64__) || defined(_M_ARM64) ++#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC) + #define CPUINFO_ARCH_ARM64 1 + #endif + diff --git a/cmake/patches/flatbuffers/flatbuffers.patch b/cmake/patches/flatbuffers/flatbuffers.patch index fb2678ef1bdc..fbe8db37ecb0 100644 --- a/cmake/patches/flatbuffers/flatbuffers.patch +++ b/cmake/patches/flatbuffers/flatbuffers.patch @@ -2,35 +2,11 @@ diff --git a/CMakeLists.txt b/CMakeLists.txt index 3987eac9..5e5462f1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt -@@ -223,7 +223,7 @@ elseif(CMAKE_COMPILER_IS_GNUCXX) - "${CMAKE_CXX_FLAGS} -std=c++0x") - endif(CYGWIN) - set(CMAKE_CXX_FLAGS -- "${CMAKE_CXX_FLAGS} -Wall -pedantic -Werror -Wextra -Werror=shadow") -+ "${CMAKE_CXX_FLAGS} -Wall -pedantic -Werror -Wextra -Werror=shadow -Wno-error=stringop-overflow") - set(FLATBUFFERS_PRIVATE_CXX_FLAGS "-Wold-style-cast") - if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 4.4) - if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0) -diff --git a/src/idl_gen_rust.cpp b/src/idl_gen_rust.cpp -index 55b8439b..dc03e8a8 100644 ---- a/src/idl_gen_rust.cpp -+++ b/src/idl_gen_rust.cpp -@@ -406,7 +406,8 @@ class RustGenerator : public BaseGenerator { - // example: f(A, D::E) -> super::D::E - // does not include leaf object (typically a struct type). - -- size_t i = 0; -+ // fix unused but set variable warning -+ //size_t i = 0; - std::stringstream stream; - - auto s = src->components.begin(); -@@ -417,7 +418,7 @@ class RustGenerator : public BaseGenerator { - if (*s != *d) { break; } - ++s; - ++d; -- ++i; -+ //++i; - } - - for (; s != src->components.end(); ++s) { stream << "super::"; } +@@ -279,5 +279,5 @@ + # Append FLATBUFFERS_CXX_FLAGS to CMAKE_CXX_FLAGS. 
+ if(DEFINED FLATBUFFERS_CXX_FLAGS) + message(STATUS "extend CXX_FLAGS with ${FLATBUFFERS_CXX_FLAGS}") +- set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${FLATBUFFERS_CXX_FLAGS}") ++ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${FLATBUFFERS_CXX_FLAGS} -Wno-error=stringop-overflow") + endif() + message(STATUS "CMAKE_CXX_FLAGS: ${CMAKE_CXX_FLAGS}") diff --git a/cmake/patches/neural_speed/150e7527d5286ddd3a995c228dedf8d76a7a86bc.patch b/cmake/patches/neural_speed/150e7527d5286ddd3a995c228dedf8d76a7a86bc.patch new file mode 100644 index 000000000000..e503a512a74f --- /dev/null +++ b/cmake/patches/neural_speed/150e7527d5286ddd3a995c228dedf8d76a7a86bc.patch @@ -0,0 +1,30 @@ +diff --git a/bestla/bestla/bestla_prologue_b.h b/bestla/bestla/bestla_prologue_b.h +index 99f3ccc..a11de9d 100644 +--- a/bestla/bestla/bestla_prologue_b.h ++++ b/bestla/bestla/bestla_prologue_b.h +@@ -456,9 +456,8 @@ class WeightKBlockNInteger { + auto tmpscales = tmp; + auto tmpzeropoints = reinterpret_cast(tmpscales + N * blks); + if (scales) { +- for (size_t i = 0; i < N * blks; i += 2) { ++ for (size_t i = 0; i < N * blks; i ++) { + tmpscales[i] = scales[i] / 16; +- tmpscales[i + 1] = scales[i + 1] / 16; + } + } + if (zero_points) { +diff --git a/bestla/bestla/kernel_avx512f.h b/bestla/bestla/kernel_avx512f.h +index 6783ee8..59822e5 100644 +--- a/bestla/bestla/kernel_avx512f.h ++++ b/bestla/bestla/kernel_avx512f.h +@@ -673,8 +673,8 @@ inline BTLA_CODE decompress_kblock_s3_s8fp(utils::bit2x4* bit2ptr, utils::bit1x8 + zmm1 = _mm512_sllv_epi32(zmm1, zmm_shift); // int3_clip => int8 + zmm2 = _mm512_sllv_epi32(zmm2, zmm_shift); // int3_clip => int8 + +- _mm512_storeu_epi8((__m512i*)dst, zmm1); +- _mm512_storeu_epi8((__m512i*)(dst + 64), zmm2); ++ _mm512_storeu_si512((__m512i*)dst, zmm1); ++ _mm512_storeu_si512((__m512i*)(dst + 64), zmm2); + }; + + assert(head_ignore_num % 8 == 0); diff --git a/cmake/patches/onnx/onnx.patch b/cmake/patches/onnx/onnx.patch index a2d7672a3d48..fe8d6622bcc0 100644 --- a/cmake/patches/onnx/onnx.patch +++ b/cmake/patches/onnx/onnx.patch @@ -1,8 +1,8 @@ -diff --git a/CMakeLists.txt b/CMakeLists.txt -index 4dd56b6e..018da488 100644 +diff --git a/CMakeLists.txt b/CMakeLists.txt +index 6d7ca846..69aa622f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt -@@ -397,6 +397,7 @@ if (MSVC) +@@ -499,6 +499,7 @@ if (MSVC) endif() else() # On non-Windows, hide all symbols we don't need @@ -10,7 +10,7 @@ index 4dd56b6e..018da488 100644 set(ONNX_API_DEFINE "-DONNX_API=__attribute__\(\(__visibility__\(\"default\"\)\)\)") set_target_properties(onnx_proto PROPERTIES CXX_VISIBILITY_PRESET hidden) set_target_properties(onnx_proto PROPERTIES VISIBILITY_INLINES_HIDDEN 1) -@@ -548,20 +549,9 @@ endif() +@@ -653,20 +654,9 @@ endif() if(MSVC) target_compile_options(onnx_proto PRIVATE /MP @@ -31,14 +31,72 @@ index 4dd56b6e..018da488 100644 ${EXTRA_FLAGS}) if(ONNX_USE_PROTOBUF_SHARED_LIBS) target_compile_options(onnx_proto +diff --git a/onnx/common/file_utils.h b/onnx/common/file_utils.h +index b847798e..a6c31904 100644 +--- a/onnx/common/file_utils.h ++++ b/onnx/common/file_utils.h +@@ -6,7 +6,6 @@ + + #pragma once + +-#include <filesystem> + #include <fstream> + #include <string> + +@@ -17,8 +16,7 @@ namespace ONNX_NAMESPACE { + + template <typename T> + void LoadProtoFromPath(const std::string proto_path, T& proto) { +- std::filesystem::path proto_u8_path = std::filesystem::u8path(proto_path); +- std::fstream proto_stream(proto_u8_path, std::ios::in | std::ios::binary); ++ std::fstream proto_stream(proto_path, std::ios::in | std::ios::binary); + if (!proto_stream.good()) { + 
fail_check("Unable to open proto file: ", proto_path, ". Please check if it is a valid proto. "); + } +diff --git a/onnx/defs/quantization/defs.cc b/onnx/defs/quantization/defs.cc +index 70b4a4db..98c11545 100644 +--- a/onnx/defs/quantization/defs.cc ++++ b/onnx/defs/quantization/defs.cc +@@ -200,6 +200,9 @@ ONNX_OPERATOR_SET_SCHEMA( + .SetDoc(DequantizeLinear_ver21_doc) + .TypeAndShapeInferenceFunction([](ONNX_NAMESPACE::InferenceContext& ctx) { + propagateElemTypeFromInputToOutput(ctx, 1, 0); ++ if (!hasInputShape(ctx, 0)) { ++ return; ++ } + auto& input_shape = getInputShape(ctx, 0); + updateOutputShape(ctx, 0, input_shape); + })); +diff --git a/onnx/defs/quantization/old.cc b/onnx/defs/quantization/old.cc +index 3f2d6384..d2f7cfd8 100644 +--- a/onnx/defs/quantization/old.cc ++++ b/onnx/defs/quantization/old.cc +@@ -130,6 +130,9 @@ ONNX_OPERATOR_SET_SCHEMA( + .SetDoc(DequantizeLinear_ver19_doc) + .TypeAndShapeInferenceFunction([](ONNX_NAMESPACE::InferenceContext& ctx) { + propagateElemTypeFromInputToOutput(ctx, 1, 0); ++ if (!hasInputShape(ctx, 0)) { ++ return; ++ } + auto& input_shape = getInputShape(ctx, 0); + updateOutputShape(ctx, 0, input_shape); + })); +@@ -181,7 +184,6 @@ ONNX_OPERATOR_SET_SCHEMA( + if (!hasInputShape(ctx, 0)) { + return; + } +- + auto& input_shape = getInputShape(ctx, 0); + updateOutputShape(ctx, 0, input_shape); + })); diff --git a/onnx/onnx_pb.h b/onnx/onnx_pb.h -index 0aab3e26..0f859267 100644 +index 0aab3e26..398ac2d6 100644 --- a/onnx/onnx_pb.h +++ b/onnx/onnx_pb.h @@ -47,10 +47,28 @@ #define ONNX_API ONNX_IMPORT #endif - + +#if defined(__GNUC__) +#pragma GCC diagnostic push + @@ -58,9 +116,61 @@ index 0aab3e26..0f859267 100644 #else #include "onnx/onnx.pb.h" #endif - + +#if defined(__GNUC__) +#pragma GCC diagnostic pop +#endif + #endif // ! ONNX_ONNX_PB_H +diff --git a/onnx/shape_inference/implementation.cc b/onnx/shape_inference/implementation.cc +index fab1faf2..8723dcd4 100644 +--- a/onnx/shape_inference/implementation.cc ++++ b/onnx/shape_inference/implementation.cc +@@ -488,29 +488,29 @@ class ShapeInferenceImplBase { + ProcessCall(n, *(iter->second), ctx); + } else { + has_unsupported_op = true; ++ return; + } + } else { + has_unsupported_op = true; ++ return; + } +- if (!has_unsupported_op) { +- for (int i = 0; i < n.output_size(); ++i) { +- // skip type and shape propagation for missing optional outputs. +- if (!n.output(i).empty()) +- UpdateType(n.output(i), ctx.getOutputType(i)); +- } +- // Constant values are tracked to improve inference/checking for subsequent nodes. +- ProcessConstant(n); +- // If data-propagation is enabled, partial-evaluation (aka data-propagation) is performed +- // to improve inference/checking for subsequent nodes. +- if (options.enable_data_propagation && schema && schema->has_data_propagation_function()) { +- if (generated_shape_data_by_name == nullptr) { +- fail_shape_inference( +- "Container for generated shape data cannot be nullptr when enable_data_propagation option is set."); +- } +- DataPropagationContextImpl data_propagation_ctx( +- n, value_types_by_name, input_data_by_name, *generated_shape_data_by_name); +- schema->GetDataPropagationFunction()(data_propagation_ctx); ++ for (int i = 0; i < n.output_size(); ++i) { ++ // skip type and shape propagation for missing optional outputs. ++ if (!n.output(i).empty()) ++ UpdateType(n.output(i), ctx.getOutputType(i)); ++ } ++ // Constant values are tracked to improve inference/checking for subsequent nodes. 
++ ProcessConstant(n); ++ // If data-propagation is enabled, partial-evaluation (aka data-propagation) is performed ++ // to improve inference/checking for subsequent nodes. ++ if (options.enable_data_propagation && schema && schema->has_data_propagation_function()) { ++ if (generated_shape_data_by_name == nullptr) { ++ fail_shape_inference( ++ "Container for generated shape data cannot be nullptr when enable_data_propagation option is set."); + } ++ DataPropagationContextImpl data_propagation_ctx( ++ n, value_types_by_name, input_data_by_name, *generated_shape_data_by_name); ++ schema->GetDataPropagationFunction()(data_propagation_ctx); + } + } + ONNX_CATCH(const ONNX_NAMESPACE::InferenceError& ex) { diff --git a/cmake/riscv64.toolchain.cmake b/cmake/riscv64.toolchain.cmake new file mode 100644 index 000000000000..0fda239f9a62 --- /dev/null +++ b/cmake/riscv64.toolchain.cmake @@ -0,0 +1,35 @@ +# Copyright (c) 2024 SiFive, Inc. All rights reserved. +# Copyright (c) 2024, Phoebe Chen +# Licensed under the MIT License. + +set(CMAKE_SYSTEM_NAME Linux) +set(CMAKE_SYSTEM_PROCESSOR riscv64) + +list(APPEND CMAKE_TRY_COMPILE_PLATFORM_VARIABLES RISCV_TOOLCHAIN_ROOT) + +if(NOT RISCV_TOOLCHAIN_ROOT) + message(FATAL_ERROR "RISCV_TOOLCHAIN_ROOT is not defined. Please set the RISCV_TOOLCHAIN_ROOT variable.") +endif() + +set(CMAKE_C_COMPILER "${RISCV_TOOLCHAIN_ROOT}/bin/riscv64-unknown-linux-gnu-gcc") +set(CMAKE_ASM_COMPILER "${RISCV_TOOLCHAIN_ROOT}/bin/riscv64-unknown-linux-gnu-gcc") +set(CMAKE_CXX_COMPILER "${RISCV_TOOLCHAIN_ROOT}/bin/riscv64-unknown-linux-gnu-g++") + +set(CMAKE_FIND_ROOT_PATH ${RISCV_TOOLCHAIN_ROOT}) +set(CMAKE_SYSROOT "${RISCV_TOOLCHAIN_ROOT}/sysroot") +set(CMAKE_INCLUDE_PATH "${RISCV_TOOLCHAIN_ROOT}/sysroot/usr/include/") +set(CMAKE_LIBRARY_PATH "${RISCV_TOOLCHAIN_ROOT}/sysroot/usr/lib/") +set(CMAKE_PROGRAM_PATH "${RISCV_TOOLCHAIN_ROOT}/sysroot/usr/bin/") + +if(RISCV_QEMU_PATH) + message(STATUS "RISCV_QEMU_PATH=${RISCV_QEMU_PATH} is defined during compilation.") + set(CMAKE_CROSSCOMPILING_EMULATOR "${RISCV_QEMU_PATH};-L;${CMAKE_SYSROOT}") +endif() + +set(CMAKE_CROSSCOMPILING TRUE) + +set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER) +set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY) +set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY) +set(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE ONLY) + diff --git a/cmake/wcos_rules_override.cmake b/cmake/wcos_rules_override.cmake index f3d8093629a4..ec2303b073d5 100644 --- a/cmake/wcos_rules_override.cmake +++ b/cmake/wcos_rules_override.cmake @@ -1,2 +1,2 @@ -set(CMAKE_C_STANDARD_LIBRARIES_INIT onecoreuap_apiset.lib) -set(CMAKE_CXX_STANDARD_LIBRARIES_INIT onecoreuap_apiset.lib) +set(CMAKE_C_STANDARD_LIBRARIES_INIT onecoreuap.lib) +set(CMAKE_CXX_STANDARD_LIBRARIES_INIT onecoreuap.lib) diff --git a/cmake/winml.cmake b/cmake/winml.cmake index 268ee3960e75..d74250b96262 100644 --- a/cmake/winml.cmake +++ b/cmake/winml.cmake @@ -836,6 +836,13 @@ if (winml_is_inbox) target_include_directories(${new_target} PRIVATE ${include_directories}) target_link_libraries(${new_target} PRIVATE ${link_libraries}) target_link_options(${new_target} PRIVATE ${link_options}) + + # Attempt to copy linker flags + get_target_property(link_flags ${target} LINK_FLAGS) + + if (NOT link_flags MATCHES ".*NOTFOUND") + set_property(TARGET ${new_target} PROPERTY LINK_FLAGS "${link_flags}") + endif() endfunction() if (WAI_ARCH STREQUAL x64 OR WAI_ARCH STREQUAL arm64) diff --git a/csharp/ApiDocs/ApiDocs.csproj b/csharp/ApiDocs/ApiDocs.csproj index 994e57913cf4..6081c444ba1a 100644 --- 
a/csharp/ApiDocs/ApiDocs.csproj +++ b/csharp/ApiDocs/ApiDocs.csproj @@ -7,7 +7,7 @@ - + all runtime; build; native; contentfiles; analyzers; buildtransitive diff --git a/csharp/sample/Microsoft.ML.OnnxRuntime.FasterRcnnSample/Microsoft.ML.OnnxRuntime.FasterRcnnSample.csproj b/csharp/sample/Microsoft.ML.OnnxRuntime.FasterRcnnSample/Microsoft.ML.OnnxRuntime.FasterRcnnSample.csproj index 3d35de1dfc6a..b268079e2cca 100644 --- a/csharp/sample/Microsoft.ML.OnnxRuntime.FasterRcnnSample/Microsoft.ML.OnnxRuntime.FasterRcnnSample.csproj +++ b/csharp/sample/Microsoft.ML.OnnxRuntime.FasterRcnnSample/Microsoft.ML.OnnxRuntime.FasterRcnnSample.csproj @@ -7,8 +7,8 @@ - - + + diff --git a/csharp/sample/Microsoft.ML.OnnxRuntime.ResNet50v2Sample/Microsoft.ML.OnnxRuntime.ResNet50v2Sample.csproj b/csharp/sample/Microsoft.ML.OnnxRuntime.ResNet50v2Sample/Microsoft.ML.OnnxRuntime.ResNet50v2Sample.csproj index af8fa611a501..647c0bbe6a24 100644 --- a/csharp/sample/Microsoft.ML.OnnxRuntime.ResNet50v2Sample/Microsoft.ML.OnnxRuntime.ResNet50v2Sample.csproj +++ b/csharp/sample/Microsoft.ML.OnnxRuntime.ResNet50v2Sample/Microsoft.ML.OnnxRuntime.ResNet50v2Sample.csproj @@ -7,8 +7,8 @@ - - + + diff --git a/csharp/src/Microsoft.ML.OnnxRuntime/NativeMethods.shared.cs b/csharp/src/Microsoft.ML.OnnxRuntime/NativeMethods.shared.cs index 4128524b3048..8a8426a0b305 100644 --- a/csharp/src/Microsoft.ML.OnnxRuntime/NativeMethods.shared.cs +++ b/csharp/src/Microsoft.ML.OnnxRuntime/NativeMethods.shared.cs @@ -362,6 +362,7 @@ static NativeMethods() OrtDisableMemPattern = (DOrtDisableMemPattern)Marshal.GetDelegateForFunctionPointer(api_.DisableMemPattern, typeof(DOrtDisableMemPattern)); OrtEnableCpuMemArena = (DOrtEnableCpuMemArena)Marshal.GetDelegateForFunctionPointer(api_.EnableCpuMemArena, typeof(DOrtEnableCpuMemArena)); OrtDisableCpuMemArena = (DOrtDisableCpuMemArena)Marshal.GetDelegateForFunctionPointer(api_.DisableCpuMemArena, typeof(DOrtDisableCpuMemArena)); + OrtDisablePerSessionThreads = (DOrtDisablePerSessionThreads)Marshal.GetDelegateForFunctionPointer(api_.DisablePerSessionThreads, typeof(DOrtDisablePerSessionThreads)); OrtSetSessionLogId = (DOrtSetSessionLogId)Marshal.GetDelegateForFunctionPointer(api_.SetSessionLogId, typeof(DOrtSetSessionLogId)); OrtSetSessionLogVerbosityLevel = (DOrtSetSessionLogVerbosityLevel)Marshal.GetDelegateForFunctionPointer(api_.SetSessionLogVerbosityLevel, typeof(DOrtSetSessionLogVerbosityLevel)); OrtSetSessionLogSeverityLevel = (DOrtSetSessionLogSeverityLevel)Marshal.GetDelegateForFunctionPointer(api_.SetSessionLogSeverityLevel, typeof(DOrtSetSessionLogSeverityLevel)); @@ -992,6 +993,10 @@ IntPtr[] outputValues /* An array of output value pointers. 
Array must be alloca public delegate IntPtr /*(OrtStatus*)*/ DOrtDisableCpuMemArena(IntPtr /* OrtSessionOptions* */ options); public static DOrtDisableCpuMemArena OrtDisableCpuMemArena; + [UnmanagedFunctionPointer(CallingConvention.Winapi)] + public delegate IntPtr /*(OrtStatus*)*/ DOrtDisablePerSessionThreads(IntPtr /* OrtSessionOptions* */ options); + public static DOrtDisablePerSessionThreads OrtDisablePerSessionThreads; + [UnmanagedFunctionPointer(CallingConvention.Winapi)] public delegate IntPtr /*(OrtStatus*)*/ DOrtSetSessionLogId(IntPtr /* OrtSessionOptions* */ options, byte[] /* const char* */ logId); public static DOrtSetSessionLogId OrtSetSessionLogId; diff --git a/csharp/src/Microsoft.ML.OnnxRuntime/SessionOptions.shared.cs b/csharp/src/Microsoft.ML.OnnxRuntime/SessionOptions.shared.cs index 7a68246c9b67..30d005b3c423 100644 --- a/csharp/src/Microsoft.ML.OnnxRuntime/SessionOptions.shared.cs +++ b/csharp/src/Microsoft.ML.OnnxRuntime/SessionOptions.shared.cs @@ -696,6 +696,15 @@ public bool EnableCpuMemArena } private bool _enableCpuMemArena = true; + /// <summary> + /// Disables the per session threads. Default is true. + /// This makes all sessions in the process use a global TP. + /// </summary> + public void DisablePerSessionThreads() + { + NativeApiStatus.VerifySuccess(NativeMethods.OrtDisablePerSessionThreads(handle)); + } + /// <summary> /// Log Id to be used for the session. Default is empty string. /// </summary> diff --git a/csharp/src/Microsoft.ML.OnnxRuntime/Training/NativeTrainingMethods.shared.cs b/csharp/src/Microsoft.ML.OnnxRuntime/Training/NativeTrainingMethods.shared.cs index 68a399f8b967..7fe16f4156ef 100644 --- a/csharp/src/Microsoft.ML.OnnxRuntime/Training/NativeTrainingMethods.shared.cs +++ b/csharp/src/Microsoft.ML.OnnxRuntime/Training/NativeTrainingMethods.shared.cs @@ -65,10 +65,10 @@ static NativeTrainingMethods() DOrtGetApi OrtGetApi = (DOrtGetApi)Marshal.GetDelegateForFunctionPointer(NativeMethods.OrtGetApiBase().GetApi, typeof(DOrtGetApi)); // TODO: Make this save the pointer, and not copy the whole structure across - api_ = (OrtApi)OrtGetApi(17 /*ORT_API_VERSION*/); + api_ = (OrtApi)OrtGetApi(18 /*ORT_API_VERSION*/); OrtGetTrainingApi = (DOrtGetTrainingApi)Marshal.GetDelegateForFunctionPointer(api_.GetTrainingApi, typeof(DOrtGetTrainingApi)); - trainingApiPtr = OrtGetTrainingApi(17 /*ORT_API_VERSION*/); + trainingApiPtr = OrtGetTrainingApi(18 /*ORT_API_VERSION*/); if (trainingApiPtr != IntPtr.Zero) { trainingApi_ = (OrtTrainingApi)Marshal.PtrToStructure(trainingApiPtr, typeof(OrtTrainingApi)); diff --git a/csharp/src/Microsoft.ML.OnnxRuntime/Training/TrainingSession.shared.cs b/csharp/src/Microsoft.ML.OnnxRuntime/Training/TrainingSession.shared.cs index 877677dcad57..fec0d46e96df 100644 --- a/csharp/src/Microsoft.ML.OnnxRuntime/Training/TrainingSession.shared.cs +++ b/csharp/src/Microsoft.ML.OnnxRuntime/Training/TrainingSession.shared.cs @@ -282,6 +282,48 @@ public IDisposableReadOnlyCollection TrainStep( } } + /// <summary> + /// This function performs a training step that computes the outputs of the training model and the gradients + /// of the trainable parameters for the given OrtValue inputs. The train step is performed based on the training model + /// that was provided to the training session. + /// The TrainStep method is equivalent to running forward propagation and backward propagation in a single + /// step. + /// The gradients computed are stored inside the training session state so they can be later consumed + /// by the OptimizerStep function. 
+ /// The gradients can be lazily reset by invoking the LazyResetGrad function. + /// Example usage: + /// <code> + /// using OrtValue x = OrtValue.CreateTensorValueFromMemory(...); + /// using OrtValue label = OrtValue.CreateTensorValueFromMemory(...); + /// List<OrtValue> inputValues = new List<OrtValue> { x, label }; + /// using (var loss = trainingSession.TrainStep(inputValues)) + /// { + /// // process output values + /// } + /// </code> + /// </summary> + /// <param name="inputValues">Specify a collection of <see cref="OrtValue"/> that indicates the input values to the training model.</param> + /// <returns>Output Tensors in a Collection of NamedOnnxValue. User must dispose the output.</returns> + public IDisposableReadOnlyCollection<OrtValue> TrainStep(IReadOnlyCollection<OrtValue> inputValues) + { + IntPtr[] inputValuesArray = GetOrtValuesHandles(inputValues); + IntPtr[] outputValuesArray = new IntPtr[(int)_trainOutputCount]; + + NativeApiStatus.VerifySuccess(NativeTrainingMethods.OrtTrainStep(_nativeHandle, IntPtr.Zero, (UIntPtr)inputValues.Count, + inputValuesArray, (UIntPtr)_trainOutputCount, outputValuesArray)); + + + var disposableHandles = new DisposableOrtValueHandleArray(outputValuesArray); + try + { + return CreateDisposableResult(disposableHandles); + } + finally + { + disposableHandles.Dispose(); + } + } + /// <summary> /// Convert native OrtValue handles to OrtValue instances /// in an exceptions safe manner. @@ -370,6 +412,42 @@ public void EvalStep( inputValuesArray, (UIntPtr)outputValues.Count, outputValuesArray)); } + /// <summary> + /// This function performs an eval step that computes the outputs of the eval model for the given inputs. + /// Inputs are expected to be of type OrtValue. The eval step is performed based on the eval model that was + /// provided to the training session. + /// Example usage: + /// <code> + /// using OrtValue x = OrtValue.CreateTensorValueFromMemory(...); + /// using OrtValue label = OrtValue.CreateTensorValueFromMemory(...); + /// List<OrtValue> inputValues = new List<OrtValue> { x, label }; + /// using (var loss = trainingSession.EvalStep(inputValues)) + /// { + /// // process output values + /// } + /// </code> + /// </summary> + /// <param name="inputValues">Specify a collection of <see cref="OrtValue"/> that indicates the input values to the eval model.</param> + public IDisposableReadOnlyCollection<OrtValue> EvalStep(IReadOnlyCollection<OrtValue> inputValues) + { + IntPtr[] inputValuesArray = GetOrtValuesHandles(inputValues); + IntPtr[] outputValuesArray = new IntPtr[(int)_evalOutputCount]; + + NativeApiStatus.VerifySuccess(NativeTrainingMethods.OrtEvalStep(_nativeHandle, IntPtr.Zero, (UIntPtr)inputValues.Count, + inputValuesArray, (UIntPtr)_evalOutputCount, outputValuesArray)); + + + var disposableHandles = new DisposableOrtValueHandleArray(outputValuesArray); + try + { + return CreateDisposableResult(disposableHandles); + } + finally + { + disposableHandles.Dispose(); + } + } + /// <summary> /// Sets the learning rate for this training session. 
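// A minimal usage sketch for the OrtValue-based TrainStep/EvalStep overloads added in the hunks
// above. This is not part of the patch itself: the file names, tensor shapes, and dummy data below
// are illustrative only (they mirror the unit tests later in this diff).
using System.Collections.Generic;
using Microsoft.ML.OnnxRuntime;

var state = CheckpointState.LoadCheckpoint("checkpoint.ckpt");
using var session = new TrainingSession(state, "training_model.onnx", "eval_model.onnx", "adamw.onnx");

float[] inputData = new float[2 * 784];   // a batch of two flattened 28x28 images (dummy values)
long[] inputShape = { 2, 784 };
int[] labelsData = { 1, 1 };
long[] labelsShape = { 2 };

using OrtValue input = OrtValue.CreateTensorValueFromMemory(inputData, inputShape);
using OrtValue labels = OrtValue.CreateTensorValueFromMemory(labelsData, labelsShape);
var inputs = new List<OrtValue> { input, labels };

// Forward + backward in one call; the computed gradients stay in the session state
// until OptimizerStep consumes them (or LazyResetGrad clears them).
using (var outputs = session.TrainStep(inputs))
{
    float loss = outputs[0].GetTensorDataAsSpan<float>()[0];
}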
@@ -702,6 +780,35 @@ private IntPtr[] GetOrtValuesHandles(IReadOnlyCollection v return valuesArray; } + private IntPtr[] GetOrtValuesHandles(IReadOnlyCollection<OrtValue> inputValues) + { + var valuesArray = new IntPtr[inputValues.Count]; + for (int index = 0; index < inputValues.Count; ++index) + { + valuesArray[index] = inputValues.ElementAt(index).Handle; + } + return valuesArray; + } + + private static IDisposableReadOnlyCollection<OrtValue> CreateDisposableResult(DisposableOrtValueHandleArray disposableHandles) + { + var outputValues = new DisposableList<OrtValue>(disposableHandles.Span.Length); + try + { + for (int i = 0; i < disposableHandles.Span.Length; i++) + { + outputValues.Add(new OrtValue(disposableHandles.Span[i])); + disposableHandles.Span[i] = IntPtr.Zero; + } + return outputValues; + } + catch (Exception) + { + outputValues.Dispose(); + throw; + } + } + private IntPtr[] ConvertNamesToUtf8(IReadOnlyCollection names, DisposableList cleanupList) { cleanupList.Capacity += names.Count; diff --git a/csharp/test/Microsoft.ML.OnnxRuntime.EndToEndTests.Mobile/EndToEndTests.Mobile.Automation/EndToEndTests.Mobile.Automation.csproj b/csharp/test/Microsoft.ML.OnnxRuntime.EndToEndTests.Mobile/EndToEndTests.Mobile.Automation/EndToEndTests.Mobile.Automation.csproj index b90929ad6d1c..7bda34d26629 100644 --- a/csharp/test/Microsoft.ML.OnnxRuntime.EndToEndTests.Mobile/EndToEndTests.Mobile.Automation/EndToEndTests.Mobile.Automation.csproj +++ b/csharp/test/Microsoft.ML.OnnxRuntime.EndToEndTests.Mobile/EndToEndTests.Mobile.Automation/EndToEndTests.Mobile.Automation.csproj @@ -6,7 +6,7 @@ - + diff --git a/csharp/test/Microsoft.ML.OnnxRuntime.EndToEndTests/Microsoft.ML.OnnxRuntime.EndToEndTests.csproj b/csharp/test/Microsoft.ML.OnnxRuntime.EndToEndTests/Microsoft.ML.OnnxRuntime.EndToEndTests.csproj index 1c9827c5bac6..5ff924bcf82f 100644 --- a/csharp/test/Microsoft.ML.OnnxRuntime.EndToEndTests/Microsoft.ML.OnnxRuntime.EndToEndTests.csproj +++ b/csharp/test/Microsoft.ML.OnnxRuntime.EndToEndTests/Microsoft.ML.OnnxRuntime.EndToEndTests.csproj @@ -37,10 +37,10 @@ - + - + diff --git a/csharp/test/Microsoft.ML.OnnxRuntime.Tests.Common/InferenceTest.cs b/csharp/test/Microsoft.ML.OnnxRuntime.Tests.Common/InferenceTest.cs index fd8feda359f9..d6a6b9627f41 100644 --- a/csharp/test/Microsoft.ML.OnnxRuntime.Tests.Common/InferenceTest.cs +++ b/csharp/test/Microsoft.ML.OnnxRuntime.Tests.Common/InferenceTest.cs @@ -55,6 +55,9 @@ public void TestSessionOptions() Assert.Equal(0, opt.InterOpNumThreads); Assert.Equal(GraphOptimizationLevel.ORT_ENABLE_ALL, opt.GraphOptimizationLevel); + // No get, so no verify + opt.DisablePerSessionThreads(); + // try setting options opt.ExecutionMode = ExecutionMode.ORT_PARALLEL; Assert.Equal(ExecutionMode.ORT_PARALLEL, opt.ExecutionMode); @@ -98,7 +101,7 @@ public void TestSessionOptions() Assert.Contains("[ErrorCode:InvalidArgument] Config key is empty", ex.Message); // SessionOptions.RegisterOrtExtensions can be manually tested by referencing the - // Microsoft.ML.OnnxRuntime.Extensions nuget package. After that is done, this should not throw. + // Microsoft.ML.OnnxRuntime.Extensions nuget package. After that is done, this should not throw. 
ex = Assert.Throws<OnnxRuntimeException>(() => { opt.RegisterOrtExtensions(); }); Assert.Contains("Microsoft.ML.OnnxRuntime.Extensions NuGet package must be referenced", ex.Message); diff --git a/csharp/test/Microsoft.ML.OnnxRuntime.Tests.Common/Microsoft.ML.OnnxRuntime.Tests.Common.csproj b/csharp/test/Microsoft.ML.OnnxRuntime.Tests.Common/Microsoft.ML.OnnxRuntime.Tests.Common.csproj index ee81ab77432d..ab27d62c3bf3 100644 --- a/csharp/test/Microsoft.ML.OnnxRuntime.Tests.Common/Microsoft.ML.OnnxRuntime.Tests.Common.csproj +++ b/csharp/test/Microsoft.ML.OnnxRuntime.Tests.Common/Microsoft.ML.OnnxRuntime.Tests.Common.csproj @@ -119,8 +119,8 @@ - - + + diff --git a/csharp/test/Microsoft.ML.OnnxRuntime.Tests.Common/TrainingTest.cs b/csharp/test/Microsoft.ML.OnnxRuntime.Tests.Common/TrainingTest.cs index 68b1d5bcc614..9b7232620132 100644 --- a/csharp/test/Microsoft.ML.OnnxRuntime.Tests.Common/TrainingTest.cs +++ b/csharp/test/Microsoft.ML.OnnxRuntime.Tests.Common/TrainingTest.cs @@ -612,6 +612,81 @@ public void TestUpdateParameter() } } + [Fact(DisplayName = "TestTrainingSessionTrainStepWithOrtValues")] + public void TestTrainingSessionTrainStepWithOrtValues() + { + string checkpointPath = Path.Combine(Directory.GetCurrentDirectory(), "checkpoint.ckpt"); + using (var cleanUp = new DisposableListTest<IDisposable>()) + { + var state = CheckpointState.LoadCheckpoint(checkpointPath); + cleanUp.Add(state); + Assert.NotNull(state); + string trainingPath = Path.Combine(Directory.GetCurrentDirectory(), "training_model.onnx"); + string optimizerPath = Path.Combine(Directory.GetCurrentDirectory(), "adamw.onnx"); + + var trainingSession = new TrainingSession(state, trainingPath, optimizerPath); + cleanUp.Add(trainingSession); + + float[] expectedOutput = TestDataLoader.LoadTensorFromFile("loss_1.out"); + var expectedOutputDimensions = new int[] { 1 }; + float[] inputData = TestDataLoader.LoadTensorFromFile("input-0.in"); + long[] inputShape = { 2, 784 }; + Int32[] labelsData = { 1, 1 }; + long[] labelsShape = { 2 }; + + using OrtValue inputOrtValue = OrtValue.CreateTensorValueFromMemory(inputData, inputShape); + using OrtValue labelsOrtValue = OrtValue.CreateTensorValueFromMemory(labelsData, labelsShape); + var inputValues = new List<OrtValue> { inputOrtValue, labelsOrtValue }; + + using (var results = trainingSession.TrainStep(inputValues)) + { + Assert.Single(results); + var outputOrtValue = results[0]; + Assert.True(outputOrtValue.IsTensor); + var resultSpan = outputOrtValue.GetTensorDataAsSpan<float>().ToArray(); + Assert.Equal(expectedOutput, resultSpan, new FloatComparer()); + } + } + } + + [Fact(DisplayName = "TestTrainingSessionEvalStepWithOrtValues")] + public void TestTrainingSessionEvalStepWithOrtValues() + { + string checkpointPath = Path.Combine(Directory.GetCurrentDirectory(), "checkpoint.ckpt"); + using (var cleanUp = new DisposableListTest<IDisposable>()) + { + var state = CheckpointState.LoadCheckpoint(checkpointPath); + cleanUp.Add(state); + Assert.NotNull(state); + string trainingPath = Path.Combine(Directory.GetCurrentDirectory(), "training_model.onnx"); + string optimizerPath = Path.Combine(Directory.GetCurrentDirectory(), "adamw.onnx"); + string evalPath = Path.Combine(Directory.GetCurrentDirectory(), "eval_model.onnx"); + + var trainingSession = new TrainingSession(state, trainingPath, evalPath, optimizerPath); + cleanUp.Add(trainingSession); + + float[] expectedOutput = TestDataLoader.LoadTensorFromFile("loss_1.out"); + var expectedOutputDimensions = new int[] { 1 }; + float[] inputData = TestDataLoader.LoadTensorFromFile("input-0.in"); + long[] 
inputShape = { 2, 784 }; + Int32[] labelsData = { 1, 1 }; + long[] labelsShape = { 2 }; + + using OrtValue inputOrtValue = OrtValue.CreateTensorValueFromMemory(inputData, inputShape); + using OrtValue labelsOrtValue = OrtValue.CreateTensorValueFromMemory(labelsData, labelsShape); + var inputValues = new List<OrtValue> { inputOrtValue, labelsOrtValue }; + + using (var results = trainingSession.EvalStep(inputValues)) + { + Assert.Single(results); + var outputOrtValue = results[0]; + Assert.True(outputOrtValue.IsTensor); + var resultSpan = outputOrtValue.GetTensorDataAsSpan<float>().ToArray(); + Assert.Equal(expectedOutput, resultSpan, new FloatComparer()); + } + } + } + internal class FloatComparer : IEqualityComparer<float> { private float atol = 1e-3f; diff --git a/csharp/test/Microsoft.ML.OnnxRuntime.Tests.Devices/Microsoft.ML.OnnxRuntime.Tests.Devices.csproj b/csharp/test/Microsoft.ML.OnnxRuntime.Tests.Devices/Microsoft.ML.OnnxRuntime.Tests.Devices.csproj index 37e83be5e33a..40f6d453c6a9 100644 --- a/csharp/test/Microsoft.ML.OnnxRuntime.Tests.Devices/Microsoft.ML.OnnxRuntime.Tests.Devices.csproj +++ b/csharp/test/Microsoft.ML.OnnxRuntime.Tests.Devices/Microsoft.ML.OnnxRuntime.Tests.Devices.csproj @@ -11,6 +11,6 @@ - + diff --git a/csharp/test/Microsoft.ML.OnnxRuntime.Tests.Droid/Microsoft.ML.OnnxRuntime.Tests.Droid.csproj b/csharp/test/Microsoft.ML.OnnxRuntime.Tests.Droid/Microsoft.ML.OnnxRuntime.Tests.Droid.csproj index 11855032584a..ef7e0825e919 100644 --- a/csharp/test/Microsoft.ML.OnnxRuntime.Tests.Droid/Microsoft.ML.OnnxRuntime.Tests.Droid.csproj +++ b/csharp/test/Microsoft.ML.OnnxRuntime.Tests.Droid/Microsoft.ML.OnnxRuntime.Tests.Droid.csproj @@ -134,7 +134,7 @@ 5.0.0.2083 - + diff --git a/csharp/test/Microsoft.ML.OnnxRuntime.Tests.NetCoreApp/InferenceTest.netcore.cs b/csharp/test/Microsoft.ML.OnnxRuntime.Tests.NetCoreApp/InferenceTest.netcore.cs index 715aed7e1d64..7f3d5d6624b0 100644 --- a/csharp/test/Microsoft.ML.OnnxRuntime.Tests.NetCoreApp/InferenceTest.netcore.cs +++ b/csharp/test/Microsoft.ML.OnnxRuntime.Tests.NetCoreApp/InferenceTest.netcore.cs @@ -145,7 +145,7 @@ private void TestCUDAProviderOptions() private void CanRunInferenceOnAModelWithTensorRT() { string modelPath = Path.Combine(Directory.GetCurrentDirectory(), "squeezenet.onnx"); - + int deviceId = 0; string deviceIdStr = System.Environment.GetEnvironmentVariable("ONNXRUNTIME_TEST_GPU_DEVICE_ID"); if (!string.IsNullOrEmpty(deviceIdStr) && int.TryParse(deviceIdStr, out int parsedValue) && parsedValue >= 0) diff --git a/csharp/test/Microsoft.ML.OnnxRuntime.Tests.iOS/Microsoft.ML.OnnxRuntime.Tests.iOS.csproj b/csharp/test/Microsoft.ML.OnnxRuntime.Tests.iOS/Microsoft.ML.OnnxRuntime.Tests.iOS.csproj index 352de5db0092..56e65833724f 100644 --- a/csharp/test/Microsoft.ML.OnnxRuntime.Tests.iOS/Microsoft.ML.OnnxRuntime.Tests.iOS.csproj +++ b/csharp/test/Microsoft.ML.OnnxRuntime.Tests.iOS/Microsoft.ML.OnnxRuntime.Tests.iOS.csproj @@ -99,7 +99,7 @@ 2.4.1 - + 5.0.0.2083 diff --git a/csharp/tools/MauiModelTester/MauiModelTester.csproj b/csharp/tools/MauiModelTester/MauiModelTester.csproj index b0a17978328c..39e688ce6c1b 100644 --- a/csharp/tools/MauiModelTester/MauiModelTester.csproj +++ b/csharp/tools/MauiModelTester/MauiModelTester.csproj @@ -1,8 +1,8 @@  - net6.0-android;net6.0-ios - $(TargetFrameworks);net6.0-windows10.0.19041.0 + net8.0-ios;net8.0-android34.0 + $(TargetFrameworks);net8.0-windows10.0.19041.0 Exe MauiModelTester true @@ -21,7 +21,7 @@ 1 12.0 - 21.0 + 29.0 10.0.17763.0 10.0.17763.0 true @@ -51,7 +51,7 @@ - + diff --git 
a/csharp/tools/MauiModelTester/Platforms/Android/AndroidManifest.xml b/csharp/tools/MauiModelTester/Platforms/Android/AndroidManifest.xml index cc320dab474a..2ef2296d7441 100644 --- a/csharp/tools/MauiModelTester/Platforms/Android/AndroidManifest.xml +++ b/csharp/tools/MauiModelTester/Platforms/Android/AndroidManifest.xml @@ -4,5 +4,5 @@ - + \ No newline at end of file diff --git a/csharp/tools/Microsoft.ML.OnnxRuntime.PerfTool/Microsoft.ML.OnnxRuntime.PerfTool.csproj b/csharp/tools/Microsoft.ML.OnnxRuntime.PerfTool/Microsoft.ML.OnnxRuntime.PerfTool.csproj index 24f0d14ad990..e0420a6ed045 100644 --- a/csharp/tools/Microsoft.ML.OnnxRuntime.PerfTool/Microsoft.ML.OnnxRuntime.PerfTool.csproj +++ b/csharp/tools/Microsoft.ML.OnnxRuntime.PerfTool/Microsoft.ML.OnnxRuntime.PerfTool.csproj @@ -80,7 +80,7 @@ - + diff --git a/dockerfiles/Dockerfile.migraphx b/dockerfiles/Dockerfile.migraphx index bc513a8e8ba6..c3541a8bd342 100644 --- a/dockerfiles/Dockerfile.migraphx +++ b/dockerfiles/Dockerfile.migraphx @@ -5,57 +5,22 @@ # Dockerfile to run ONNXRuntime with MIGraphX integration #-------------------------------------------------------------------------- -FROM ubuntu:20.04 +FROM rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1 ARG ONNXRUNTIME_REPO=https://github.com/Microsoft/onnxruntime ARG ONNXRUNTIME_BRANCH=main -ARG ROCM_VERSION=5.4 -# MIGraphX version should be the same as ROCm version -ARG MIGRAPHX_VERSION=rocm-5.4.0 -ENV DEBIAN_FRONTEND noninteractive -ENV MIGRAPHX_DISABLE_FAST_GELU=1 -RUN apt-get clean && apt-get update && apt-get install -y locales -RUN locale-gen en_US.UTF-8 -RUN update-locale LANG=en_US.UTF-8 -ENV LC_ALL C.UTF-8 -ENV LANG C.UTF-8 +ENV PATH /code/cmake-3.27.3-linux-x86_64/bin:${PATH} -# Install rocm -RUN apt-get update && apt-get install -y gnupg2 --no-install-recommends curl && \ - curl -sL http://repo.radeon.com/rocm/rocm.gpg.key | apt-key add - && \ - sh -c 'echo deb [arch=amd64] http://repo.radeon.com/rocm/apt/${ROCM_VERSION}/ ubuntu main > /etc/apt/sources.list.d/rocm.list' - -RUN apt-get update &&\ - apt-get install -y sudo git bash build-essential rocm-dev python3-dev python3-pip miopen-hip \ - rocblas half aria2 libnuma-dev pkg-config - -RUN aria2c -q -d /tmp -o cmake-3.27.3-linux-x86_64.tar.gz \ -https://github.com/Kitware/CMake/releases/download/v3.27.3/cmake-3.27.3-linux-x86_64.tar.gz &&\ -tar -zxf /tmp/cmake-3.27.3-linux-x86_64.tar.gz --strip=1 -C /usr - -# Install rbuild -RUN pip3 install https://github.com/RadeonOpenCompute/rbuild/archive/master.tar.gz numpy yapf==0.28.0 - -ENV PATH /opt/miniconda/bin:/code/cmake-3.27.3-linux-x86_64/bin:${PATH} - -# Install MIGraphX from source -RUN mkdir -p /migraphx -RUN cd /migraphx && git clone --depth=1 --branch ${MIGRAPHX_VERSION} https://github.com/ROCmSoftwarePlatform/AMDMIGraphX src -RUN cd /migraphx && rbuild package --cxx /opt/rocm/llvm/bin/clang++ -d /migraphx/deps -B /migraphx/build -S /migraphx/src/ -DPYTHON_EXECUTABLE=/usr/bin/python3 -RUN dpkg -i /migraphx/build/*.deb -RUN rm -rf /migraphx - -# Install rocm ep dependencies RUN apt-get update &&\ - apt-get install -y rocrand rccl hipsparse hipfft hipcub hipblas rocthrust + apt-get install -y migraphx WORKDIR /code # Prepare onnxruntime repository & build onnxruntime RUN git clone --single-branch --branch ${ONNXRUNTIME_BRANCH} --recursive ${ONNXRUNTIME_REPO} onnxruntime &&\ /bin/sh onnxruntime/dockerfiles/scripts/install_common_deps.sh &&\ - cd onnxruntime &&\ + cd onnxruntime && pip install --upgrade pip &&\ /bin/sh ./build.sh --allow_running_as_root 
--cmake_extra_defines ONNXRUNTIME_VERSION=`cat ./VERSION_NUMBER` --config Release --parallel \ --skip_tests --build_wheel --use_rocm --rocm_version=${ROCM_VERSION} --rocm_home /opt/rocm --use_migraphx &&\ pip install /code/onnxruntime/build/Linux/Release/dist/*.whl diff --git a/dockerfiles/Dockerfile.openvino b/dockerfiles/Dockerfile.openvino index 78d04a51ba16..049916fac92f 100644 --- a/dockerfiles/Dockerfile.openvino +++ b/dockerfiles/Dockerfile.openvino @@ -1,9 +1,9 @@ #------------------------------------------------------------------------- -# Copyright(C) 2021-2023 Intel Corporation. +# Copyright(C) 2021-2024 Intel Corporation. # SPDX-License-Identifier: MIT #-------------------------------------------------------------------------- -ARG OPENVINO_VERSION=2023.0.0 +ARG OPENVINO_VERSION=2024.0.0 # Build stage @@ -17,7 +17,7 @@ ARG DEVICE=CPU_FP32 ARG ONNXRUNTIME_REPO=https://github.com/microsoft/onnxruntime.git ARG ONNXRUNTIME_BRANCH=main -ENV InferenceEngine_DIR=${INTEL_OPENVINO_DIR}/runtime/cmake +ENV OpenVINO_DIR=${INTEL_OPENVINO_DIR}/runtime/cmake USER root RUN apt update; apt install -y git protobuf-compiler libprotobuf-dev diff --git a/dockerfiles/Dockerfile.openvino-centos7 b/dockerfiles/Dockerfile.openvino-centos7 deleted file mode 100755 index 697db44801e3..000000000000 --- a/dockerfiles/Dockerfile.openvino-centos7 +++ /dev/null @@ -1,105 +0,0 @@ -#------------------------------------------------------------------------- -# Copyright(C) 2021 Intel Corporation. -# SPDX-License-Identifier: MIT -#-------------------------------------------------------------------------- - -FROM centos:7.8.2003 - -WORKDIR /code - -ARG MY_ROOT=/code -ARG YUM_OV_PACKAGE=intel-openvino-runtime-centos7-2021.4.752.x86_64 -ARG DEVICE=CPU_FP32 -ARG ONNXRUNTIME_REPO=https://github.com/microsoft/onnxruntime -ARG ONNXRUNTIME_BRANCH=main - -ENV INTEL_OPENVINO_DIR=/opt/intel/openvino_2021.4.752 -ENV InferenceEngine_DIR=${INTEL_OPENVINO_DIR}/deployment_tools/inference_engine/share -ENV IE_PLUGINS_PATH=${INTEL_OPENVINO_DIR}/deployment_tools/inference_engine/lib/intel64 -ENV ngraph_DIR=${INTEL_OPENVINO_DIR}/deployment_tools/ngraph/cmake -ENV LD_LIBRARY_PATH=/opt/intel/opencl:${INTEL_OPENVINO_DIR}/inference_engine/external/gna/lib:${INTEL_OPENVINO_DIR}/deployment_tools/inference_engine/external/mkltiny_lnx/lib:$INTEL_OPENVINO_DIR/deployment_tools/ngraph/lib:${INTEL_OPENVINO_DIR}/deployment_tools/inference_engine/external/omp/lib:${INTEL_OPENVINO_DIR}/deployment_tools/inference_engine/external/tbb/lib:${IE_PLUGINS_PATH}:${LD_LIBRARY_PATH} -ENV OpenCV_DIR=${INTEL_OPENVINO_DIR}/opencv/share/OpenCV -ENV LD_LIBRARY_PATH=${INTEL_OPENVINO_DIR}/opencv/lib:${INTEL_OPENVINO_DIR}/opencv/share/OpenCV/3rdparty/lib:${LD_LIBRARY_PATH} -ENV HDDL_INSTALL_DIR=${INTEL_OPENVINO_DIR}/deployment_tools/inference_engine/external/hddl -ENV LD_LIBRARY_PATH=${INTEL_OPENVINO_DIR}/deployment_tools/inference_engine/external/hddl/lib:$LD_LIBRARY_PATH -ENV LD_LIBRARY_PATH=/usr/local/lib:/usr/lib:/usr/local/lib64:/usr/lib64:/lib64:$LD_LIBRARY_PATH - -# Install packages -RUN yum update -y && \ - yum groupinstall "Development Tools" -y && \ - yum install -y yum-utils autoconf automake libtool unzip udev wget zlib-devel libffi-devel openssl-devel boost-devel-1.53.0 && \ - yum clean packages && yum clean all && rm -rf /var/cache/yum && \ -# Install cmake - cd $MY_ROOT && \ - wget https://github.com/Kitware/CMake/releases/download/v3.27.3/cmake-3.27.3.tar.gz && \ - tar -zxvf cmake-3.27.3.tar.gz && rm -rf cmake-3.27.3.tar.gz && \ - cd cmake-3.27.3 && 
\ - ./bootstrap && \ - make && \ - make install && \ - cd $MY_ROOT && \ -# libusb1.0.22 - cd /opt/ && wget https://github.com/libusb/libusb/archive/v1.0.22.zip && \ - unzip v1.0.22.zip && rm -rf v1.0.22.zip && cd /opt/libusb-1.0.22 && \ -# bootstrap steps - ./bootstrap.sh && \ - ./configure --disable-udev --enable-shared && \ - make -j4 && \ -# configure libusb1.0.22 - cd /opt/libusb-1.0.22/libusb && \ - /bin/mkdir -p '/usr/local/lib' && \ - /bin/bash ../libtool --mode=install /usr/bin/install -c libusb-1.0.la '/usr/local/lib' && \ - /bin/mkdir -p '/usr/local/include/libusb-1.0' && \ - /usr/bin/install -c -m 644 libusb.h '/usr/local/include/libusb-1.0' && \ - /bin/mkdir -p '/usr/local/lib/pkgconfig' && \ -# Install openvino - yum-config-manager --add-repo https://yum.repos.intel.com/openvino/2021/setup/intel-openvino-2021.repo && \ - rpm --import https://yum.repos.intel.com/openvino/2021/setup/RPM-GPG-KEY-INTEL-OPENVINO-2021 && \ - yum update -y && yum list intel-openvino* && \ - yum install -y $YUM_OV_PACKAGE && \ - cd ${INTEL_OPENVINO_DIR}/install_dependencies/ && ./install_openvino_dependencies.sh -y && \ - printf "\nexport LD_LIBRARY_PATH=\${LD_LIBRARY_PATH}:/usr/local/lib\n" >> /opt/intel/openvino_2021.4.752/bin/setupvars.sh && \ - cd /opt/libusb-1.0.22 && \ - /usr/bin/install -c -m 644 libusb-1.0.pc '/usr/local/lib/pkgconfig' && \ - cp /opt/intel/openvino_2021/deployment_tools/inference_engine/external/97-myriad-usbboot.rules /etc/udev/rules.d/ && \ - ldconfig && \ -# Install GPU runtime and drivers - cd ${MY_ROOT} && \ - mkdir /tmp/opencl && \ - cd /tmp/opencl && \ - yum install -y epel-release && \ - yum install -y ocl-icd ocl-icd-devel && \ - wget -O intel-igc-core-1.0.2597-1.el7.x86_64.rpm https://sourceforge.net/projects/intel-compute-runtime/files/19.41.14441/centos-7/intel-igc-core-1.0.2597-1.el7.x86_64.rpm/download && \ - wget -O intel-opencl-19.41.14441-1.el7.x86_64.rpm https://sourceforge.net/projects/intel-compute-runtime/files/19.41.14441/centos-7/intel-opencl-19.41.14441-1.el7.x86_64.rpm/download && \ - wget -O intel-igc-opencl-devel-1.0.2597-1.el7.x86_64.rpm https://sourceforge.net/projects/intel-compute-runtime/files/19.41.14441/centos-7/intel-igc-opencl-devel-1.0.2597-1.el7.x86_64.rpm/download && \ - wget -O intel-igc-opencl-1.0.2597-1.el7.x86_64.rpm https://sourceforge.net/projects/intel-compute-runtime/files/19.41.14441/centos-7/intel-igc-opencl-1.0.2597-1.el7.x86_64.rpm/download && \ - wget -O intel-gmmlib-19.3.2-1.el7.x86_64.rpm https://sourceforge.net/projects/intel-compute-runtime/files/19.41.14441/centos-7/intel-gmmlib-19.3.2-1.el7.x86_64.rpm/download && \ - wget -O intel-gmmlib-devel-19.3.2-1.el7.x86_64.rpm https://sourceforge.net/projects/intel-compute-runtime/files/19.41.14441/centos-7/intel-gmmlib-devel-19.3.2-1.el7.x86_64.rpm/download && \ - rpm -i /tmp/opencl/*.rpm && \ - ldconfig && \ - rm -rf /tmp/opencl && \ -# Installing gcc-10 - yum install -y centos-release-scl && \ - yum install -y devtoolset-10-gcc* && \ - echo 'source scl_source enable devtoolset-10' >> ~/.bashrc && \ -# python installation - source scl_source enable devtoolset-10 && \ - cd /code/ && \ - wget https://www.python.org/ftp/python/3.8.3/Python-3.8.3.tgz && tar xvf Python-3.8.3.tgz && \ - cd Python-3.8*/ && ./configure && make && make install && \ - cd ../ && mkdir -p /usr/bin/Python38 && ln -s Python-3.8.3/ /usr/bin/Python38 && \ -# installing dependancies - yum install -y python3-lxml python3-six libusb.x86_64 && \ - yum clean packages && yum clean all && rm -rf /var/cache/yum && \ 
-# Build onnxruntime - cd $MY_ROOT && \ - pip3 install numpy wheel setuptools cython && \ - git clone --recursive -b ${ONNXRUNTIME_BRANCH} ${ONNXRUNTIME_REPO} && \ - pip3 install onnx && \ - cd /code/onnxruntime && ./build.sh --allow_running_as_root --config Release --update --build --parallel --use_openvino ${DEVICE} --build_shared_lib --build_wheel && \ - pip3 install /code/onnxruntime/build/Linux/Release/dist/*-linux_x86_64.whl && \ -# Clean up - cd $MY_ROOT && rm -rf onnxruntime Python-3* && \ - cd ${MY_ROOT}/ && rm -rf cmake* && \ - cd /usr/share/ && rm -rf gcc* && cd /usr/lib/ && rm -rf gcc cd && rm -rf .cache && \ - cd ${INTEL_OPENVINO_DIR}/ && rm -rf documentation data_processing && cd deployment_tools/ && rm -rf tools diff --git a/dockerfiles/Dockerfile.openvino-csharp b/dockerfiles/Dockerfile.openvino-csharp deleted file mode 100644 index 2529ef4b7320..000000000000 --- a/dockerfiles/Dockerfile.openvino-csharp +++ /dev/null @@ -1,90 +0,0 @@ -#------------------------------------------------------------------------- -# Copyright(C) 2021-2023 Intel Corporation. -# SPDX-License-Identifier: MIT -#-------------------------------------------------------------------------- - -ARG OPENVINO_VERSION=2023.0.0 - -# Build stage -FROM openvino/ubuntu20_runtime:${OPENVINO_VERSION} AS base - -ENV WORKDIR_PATH=/home/openvino -WORKDIR $WORKDIR_PATH -ENV DEBIAN_FRONTEND noninteractive - -USER root -RUN apt update; apt install -y --no-install-recommends wget gnupg && \ - rm -rf /var/lib/apt/lists/* - -# Install Mono -RUN wget http://download.mono-project.com/repo/xamarin.gpg && apt-key add xamarin.gpg && rm xamarin.gpg && \ - echo "deb https://download.mono-project.com/repo/ubuntu stable-bionic main" | tee /etc/apt/sources.list.d/mono-official-stable.list && \ - apt update -y && \ - apt install -y mono-devel - -# Install nuget.exe -RUN wget https://dist.nuget.org/win-x86-commandline/latest/nuget.exe && \ - mv nuget.exe /usr/local/bin/nuget.exe && \ - echo 'mono /usr/local/bin/nuget.exe $@' > /usr/local/bin/nuget && \ - chmod a+x /usr/local/bin/nuget - -# Install .NET core -RUN wget https://packages.microsoft.com/config/ubuntu/20.04/packages-microsoft-prod.deb -O packages-microsoft-prod.deb && \ - dpkg -i packages-microsoft-prod.deb && \ - apt-get update -y &&\ - apt-get install -y apt-transport-https && \ - apt-get update -y && \ - apt-get install -y dotnet-sdk-5.0 - -# Build stage -FROM base AS builder - -ENV WORKDIR_PATH=/home/openvino -WORKDIR $WORKDIR_PATH -ENV DEBIAN_FRONTEND noninteractive - -ARG DEVICE=CPU_FP32 -ARG ONNXRUNTIME_REPO=https://github.com/microsoft/onnxruntime.git -ARG ONNXRUNTIME_BRANCH=main - -ENV InferenceEngine_DIR=${INTEL_OPENVINO_DIR}/runtime/cmake -ENV LANG en_US.UTF-8 - -USER root -RUN apt update; apt install -y --no-install-recommends git protobuf-compiler libprotobuf-dev ca-certificates unattended-upgrades && \ - unattended-upgrade && \ - rm -rf /var/lib/apt/lists/* - -RUN git clone --recursive -b ${ONNXRUNTIME_BRANCH} ${ONNXRUNTIME_REPO} -RUN /bin/sh onnxruntime/dockerfiles/scripts/install_common_deps.sh -RUN ln -s cmake-* cmake-dir -RUN python3 -m pip install wheel -ENV PATH=${WORKDIR_PATH}/cmake-dir/bin:$PATH -RUN pip3 install onnx -RUN ln -s /usr/bin/python3 /usr/bin/python -RUN apt install locales && \ - locale-gen en_US en_US.UTF-8 && \ - dpkg-reconfigure locales -RUN cd onnxruntime && ./build.sh --allow_running_as_root --config Release --update --build --parallel --use_openvino ${DEVICE} --build_nuget --build_shared_lib -RUN cp 
/home/openvino/onnxruntime/build/Linux/Release/Microsoft.ML.OnnxRuntime.Managed* /home/openvino/onnxruntime/build/Linux/Release/nuget-artifacts - -# Deploy stage -FROM base - -ENV DEBIAN_FRONTEND noninteractive -USER root - -RUN apt update; apt install -y unattended-upgrades fonts-freefont-ttf && \ - unattended-upgrade -ARG BUILD_UID=1001 -ARG BUILD_USER=onnxruntimedev -RUN adduser --uid $BUILD_UID $BUILD_USER -RUN usermod -a -G video,users ${BUILD_USER} -ENV WORKDIR_PATH /home/${BUILD_USER} -WORKDIR ${WORKDIR_PATH} -COPY --from=builder /home/openvino/onnxruntime/build/Linux/Release/nuget-artifacts ${WORKDIR_PATH}/nuget-artifacts - -USER ${BUILD_USER} -ENV PATH=${WORKDIR_PATH}/miniconda/bin:${WORKDIR_PATH}/cmake-dir/bin:$PATH -ENV IE_PLUGINS_PATH=${INTEL_OPENVINO_DIR}/runtime/lib/intel64 -ENV LD_LIBRARY_PATH=/opt/intel/opencl:${INTEL_OPENVINO_DIR}/runtime/3rdparty/tbb/lib:${IE_PLUGINS_PATH}:${LD_LIBRARY_PATH} diff --git a/dockerfiles/Dockerfile.openvino-rhel8 b/dockerfiles/Dockerfile.openvino-rhel8 deleted file mode 100644 index 5c504cfa553a..000000000000 --- a/dockerfiles/Dockerfile.openvino-rhel8 +++ /dev/null @@ -1,87 +0,0 @@ -# Build stage -FROM registry.access.redhat.com/ubi8/ubi:8.4 - -WORKDIR /code - -ARG MY_ROOT=/code -ARG DEVICE=CPU_FP32 -ARG ONNXRUNTIME_REPO=https://github.com/microsoft/onnxruntime -ARG ONNXRUNTIME_BRANCH=main - -ENV INTEL_OPENVINO_DIR=/opt/intel/openvino_2022.3.0 - -ENV InferenceEngine_DIR=${INTEL_OPENVINO_DIR}/runtime/cmake -ENV IE_PLUGINS_PATH=${INTEL_OPENVINO_DIR}/runtime/lib/intel64/ -ENV ngraph_DIR=${INTEL_OPENVINO_DIR}/runtime/cmake -ENV LD_LIBRARY_PATH=${INTEL_OPENVINO_DIR}/runtime/3rdparty/tbb/lib/:${IE_PLUGINS_PATH}:${LD_LIBRARY_PATH} -ENV OpenCV_DIR=${INTEL_OPENVINO_DIR}/extras/opencv/cmake -ENV LD_LIBRARY_PATH=${INTEL_OPENVINO_DIR}/extras/opencv/lib:${LD_LIBRARY_PATH} -ENV LD_LIBRARY_PATH=/usr/local/lib:/usr/lib:/usr/local/lib64:/usr/lib64:/lib64:${LD_LIBRARY_PATH} -ENV PATH=${MY_ROOT}/cmake-dir/bin:$PATH - -# Install packages -RUN yum install -y yum-utils autoconf automake libtool unzip udev wget zlib-devel libffi-devel openssl-devel git make gcc && \ - yum clean packages && yum clean all && rm -rf /var/cache/yum && \ -# Install python 3.8 - cd $MY_ROOT && \ - wget https://www.python.org/ftp/python/3.8.9/Python-3.8.9.tgz && tar xvf Python-3.8.9.tgz && rm -rf Python-3.8.9.tgz && \ - cd Python-3.8*/ && ./configure && make && make install && \ - cd ../ && mkdir -p /usr/bin/Python38 && ln -s Python-3.8.9/ /usr/bin/Python38 && ln -s /usr/bin/pip3 /usr/bin/pip && \ -# libusb1.0.22 - cd /opt/ && wget https://github.com/libusb/libusb/archive/v1.0.22.zip && \ - unzip v1.0.22.zip && rm -rf v1.0.22.zip && cd /opt/libusb-1.0.22 && \ -# bootstrap steps - ./bootstrap.sh && \ - ./configure --disable-udev --enable-shared && \ - make -j4 && \ -# configure libusb1.0.22 - cd /opt/libusb-1.0.22/libusb && \ - /bin/mkdir -p '/usr/local/lib' && \ - /bin/bash ../libtool --mode=install /usr/bin/install -c libusb-1.0.la '/usr/local/lib' && \ - /bin/mkdir -p '/usr/local/include/libusb-1.0' && \ - /usr/bin/install -c -m 644 libusb.h '/usr/local/include/libusb-1.0' && \ - /bin/mkdir -p '/usr/local/lib/pkgconfig' && \ -# Install openvino - cd /opt/ && mkdir intel/ && cd intel && \ - wget https://storage.openvinotoolkit.org/repositories/openvino/packages/2022.3/linux/l_openvino_toolkit_rhel8_2022.3.0.9052.9752fafe8eb_x86_64.tgz && \ - tar xvf l_openvino_toolkit_rhel8_2022.3.0.9052.9752fafe8eb_x86_64.tgz && \ - rm -rf l_openvino_toolkit_rhel8_2022.3.0.9052.9752fafe8eb_x86_64.tgz && 
\ - mv l_openvino_toolkit_rhel8_2022.3.0.9052.9752fafe8eb_x86_64 openvino_2022.3.0 && \ - cd ${INTEL_OPENVINO_DIR}/install_dependencies/ && ./install_openvino_dependencies.sh -y && ./install_NEO_OCL_driver.sh -y && \ - printf "\nexport LD_LIBRARY_PATH=\${LD_LIBRARY_PATH}:/usr/local/lib\n" >> /opt/intel/openvino_2022.3.0/setupvars.sh && \ - cd /opt/libusb-1.0.22 && \ - /usr/bin/install -c -m 644 libusb-1.0.pc '/usr/local/lib/pkgconfig' && \ - # MYRIAD plugins are not available for openvino 2022.3.0 release - #cp /opt/intel/openvino_2022.3.0/install_dependencies/97-myriad-usbboot.rules /etc/udev/rules.d/ && \ - ldconfig && \ -#Install protobuf - cd $MY_ROOT && \ - git clone https://github.com/protocolbuffers/protobuf.git && \ - cd protobuf && \ - git checkout v3.16.0 && \ - git submodule update --init --recursive && \ - mkdir build_source && cd build_source && \ - cmake ../cmake -DCMAKE_INSTALL_LIBDIR=lib64 -Dprotobuf_BUILD_SHARED_LIBS=OFF -DCMAKE_INSTALL_PREFIX=/usr -DCMAKE_INSTALL_SYSCONFDIR=/etc -DCMAKE_POSITION_INDEPENDENT_CODE=ON -Dprotobuf_BUILD_TESTS=OFF -DCMAKE_BUILD_TYPE=Release && \ - make -j$(nproc) && \ - make install && \ -# Build onnxruntime - cd $MY_ROOT && \ - pip3 install numpy wheel setuptools cython onnx && \ - git clone --recursive -b ${ONNXRUNTIME_BRANCH} ${ONNXRUNTIME_REPO} && \ - bash onnxruntime/dockerfiles/scripts/install_common_deps.sh && \ - ln -s cmake-* cmake-dir && \ - source /opt/intel/openvino_2022.3.0/setupvars.sh && \ - cd /code/onnxruntime && ./build.sh --allow_running_as_root --config Release --update --build --parallel --use_openvino ${DEVICE} --build_shared_lib --build_wheel && \ - pip3 install /code/onnxruntime/build/Linux/Release/dist/*-linux_x86_64.whl && \ -# Clean up - cd ${MY_ROOT} && rm -rf onnxruntime && rm -rf Python-3.8.9 && rm -rf protobuf - -# Deploy stage -ARG BUILD_UID=1001 -ARG BUILD_USER=onnxruntimedev -RUN adduser --uid $BUILD_UID $BUILD_USER -RUN usermod -a -G video,users,render ${BUILD_USER} -ENV WORKDIR_PATH /home/${BUILD_USER} - -WORKDIR ${WORKDIR_PATH} -USER ${BUILD_USER} diff --git a/dockerfiles/Dockerfile.rocm b/dockerfiles/Dockerfile.rocm index 35a676383337..c242933f677f 100644 --- a/dockerfiles/Dockerfile.rocm +++ b/dockerfiles/Dockerfile.rocm @@ -5,14 +5,14 @@ # Dockerfile to run ONNXRuntime with ROCm integration #-------------------------------------------------------------------------- -FROM rocm/pytorch:rocm5.4_ubuntu20.04_py3.7_pytorch_1.12.1 +FROM rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1 ARG ONNXRUNTIME_REPO=https://github.com/Microsoft/onnxruntime ARG ONNXRUNTIME_BRANCH=main WORKDIR /code -ENV PATH /opt/miniconda/bin:/code/cmake-3.27.3-linux-x86_64/bin:${PATH} +ENV PATH /code/cmake-3.27.3-linux-x86_64/bin:${PATH} # Prepare onnxruntime repository & build onnxruntime RUN git clone --single-branch --branch ${ONNXRUNTIME_BRANCH} --recursive ${ONNXRUNTIME_REPO} onnxruntime &&\ diff --git a/dockerfiles/README.md b/dockerfiles/README.md index f226ebfe8b19..a2e99d66d465 100644 --- a/dockerfiles/README.md +++ b/dockerfiles/README.md @@ -277,7 +277,7 @@ Nothing else from ONNX Runtime source tree will be copied/installed to the image Note: When running the container you built in Docker, please either use 'nvidia-docker' command instead of 'docker', or use Docker command-line options to make sure NVIDIA runtime will be used and appropiate files mounted from host. Otherwise, CUDA libraries won't be found. 
You can also [set NVIDIA runtime as default in Docker](https://github.com/dusty-nv/jetson-containers#docker-default-runtime). ## MIGraphX -**Ubuntu 20.04, ROCm5.4, AMDMIGraphX v1.2** +**Ubuntu 20.04, ROCm6.0, MIGraphX** 1. Build the docker image from the Dockerfile in this repository. ``` @@ -291,7 +291,7 @@ Note: When running the container you built in Docker, please either use 'nvidia- ``` ## ROCm -**Ubuntu 20.04, ROCm5.4** +**Ubuntu 20.04, ROCm6.0** 1. Build the docker image from the Dockerfile in this repository. ``` diff --git a/docs/ContribOperators.md b/docs/ContribOperators.md index 131db5d8d9b3..3d984a54c049 100644 --- a/docs/ContribOperators.md +++ b/docs/ContribOperators.md @@ -41,6 +41,7 @@ Do not modify directly.* * com.microsoft.Gelu * com.microsoft.GemmFastGelu * com.microsoft.GemmFloat8 + * com.microsoft.GemmaRotaryEmbedding * com.microsoft.GreedySearch * com.microsoft.GridSample * com.microsoft.GroupNorm @@ -78,6 +79,7 @@ Do not modify directly.* * com.microsoft.QLinearSigmoid * com.microsoft.QLinearSoftmax * com.microsoft.QLinearWhere + * com.microsoft.QMoE * com.microsoft.QOrderedAttention * com.microsoft.QOrderedGelu * com.microsoft.QOrderedLayerNormalization @@ -155,6 +157,8 @@ This version of the operator has been available since version 1 of the 'com.micr
Corresponding past and present are same tensor, its size is (2, batch_size, num_heads, max_sequence_length, head_size)
qkv_hidden_sizes : list of ints
Hidden dimension of Q, K, V: hidden_size, hidden_size and v_hidden_size
+
rotary_embedding_dim : int
+
Dimension of rotary embedding. Limited to 32, 64 or 128. Default value is head_size
scale : float
Custom scale will be used if specified. Default value is 1/sqrt(head_size)
unidirectional : int
@@ -459,7 +463,7 @@ This version of the operator has been available since version 1 of the 'com.micr
repetition_penalty (optional) : T
The parameter for repetition penalty. Default value 1.0 means no penalty. Accepts value > 0.0. Shape is (1)
vocab_mask (optional) : M
-
Mask of vocabulary. Words that masked with 0 are not allowed to be generated, and 1 is allowed. Shape is (vacab_size)
+
Mask of vocabulary. Words that masked with 0 are not allowed to be generated, and 1 is allowed. Shape is (vocab_size)
prefix_vocab_mask (optional) : M
Mask of vocabulary for first step. Words that masked with 0 are not allowed to be generated, and 1 is allowed. Shape is (batch_size, vocab_size)
attention_mask (optional) : I
@@ -1586,6 +1590,8 @@ This version of the operator has been available since version 1 of the 'com.micr
payload of the execution provider context if embed_mode=1, or path to the context file if embed_mode=0.
ep_sdk_version : string
(Optional) SDK version used to convert the model.
+
hardware_architecture : string
+
(Optional) Hardware architecture.
main_context : int
Usually each single EPContext associate with a graph partition.But for some case like QNN, it has single EPContext contains all partitions.In that case, the node with ep_cache_context should set main_context=1. Other nodes set main_context=0 and skip ep_cache_context.The path is relative to this Onnx file. Default is 1.
notes : string
@@ -2205,6 +2211,69 @@ This version of the operator has been available since version 1 of the 'com.micr +### **com.microsoft.GemmaRotaryEmbedding** + + GemmaRotaryEmbedding is the implementation of below part of rotary positional embeddings (RoPE). It implements below from modeling_gemma.py. + + Here's onnxscript that was tested + + from onnxscript import FLOAT, FLOAT16, script + from onnxscript import opset18 as op + + @script() + def gemma_rotary_embedding(emb: FLOAT["bs", "seq_len", "dim"], q: FLOAT16["bs", "num_heads", "seq_len", "dim"], q_rot: FLOAT16["bs", "num_heads", "seq_len", "dim"], k: FLOAT16["bs", "num_heads", "seq_len", "dim"], k_rot: FLOAT16["bs", "num_heads", "seq_len", "dim"]): + sin_val = op.Sin(emb) + casted_sin = op.Cast(sin_val, to=10) # for fp16 mix-precision training. Other types are not supported. + cos_val = op.Cos(emb) + casted_cos = op.Cast(cos_val, to=10) + unsqueezed_sin = op.Unsqueeze(casted_sin, [1]) + unsqueezed_cos = op.Unsqueeze(casted_cos, [1]) + q_embed = (q * casted_cos) + (q_rot * casted_sin) + k_embed = (k * casted_cos) + (k_rot * casted_sin) + return q_embed, k_embed + + onnx_model = gemma_rotary_embedding.to_model_proto() + + + +#### Version + +This version of the operator has been available since version 1 of the 'com.microsoft' operator set. + +#### Inputs + +
+
emb : U
+
embedding - 3D tensor with shape (batch_size, seq_len, dim)&lt;/dd&gt;
+
q : T
+
q state - 4D tensor with shape (batch_size, num_heads, seq_len, dim)
+
q_rot : T
+
half rotated q state - 4D tensor with shape (batch_size, num_heads, seq_len, dim)
+
k : T
+
k state - 4D tensor with shape (batch_size, num_heads, seq_len, dim)
+
k_rot : T
+
half rotated k state - 4D tensor with shape (batch_size, num_heads, seq_len, dim)&lt;/dd&gt;
+
+ +#### Outputs + +
+
output1 : T
+
4D tensor with shape (batch_size, num_heads, seq_len, dim)
+
output2 : T
+
4D tensor with shape (batch_size, num_heads, seq_len, dim)
+
+ +#### Type Constraints + +
+
T : tensor(float16)
+
Constrain input and output types to float16 tensors.
+
U : tensor(float)
+
Constrain input 0 type to float tensors.&lt;/dd&gt;
+
+ + ### **com.microsoft.GreedySearch** Greedy Search for text generation. @@ -2248,7 +2317,7 @@ This version of the operator has been available since version 1 of the 'com.micr
repetition_penalty (optional) : T
The parameter for repetition penalty. Default value 1.0 means no penalty. Accepts value > 0.0. Shape is (1)
vocab_mask (optional) : I
-
Mask of vocabulary. Words that masked with 0 are not allowed to be generated, and 1 is allowed. Shape is (vacab_size)
+
Mask of vocabulary. Words that masked with 0 are not allowed to be generated, and 1 is allowed. Shape is (vocab_size)
prefix_vocab_mask (optional) : I
Mask of vocabulary for first step. Words that masked with 0 are not allowed to be generated, and 1 is allowed. Shape is (batch_size, vocab_size)
attention_mask (optional) : I
@@ -2394,24 +2463,28 @@ This version of the operator has been available since version 1 of the 'com.micr #### Attributes
+
do_rotary : int
+
Whether to use rotary position embedding. Default value is 0.
kv_num_heads : int (required)
Number of attention heads for k and v
local_window_size : int
left_window_size for local attention (like Mistral). Default value is -1 meaning unused.
num_heads : int (required)
Number of attention heads for q
+
rotary_interleaved : int
+
Rotate using interleaved pattern. Default value is 0 (False).
scale : float
Custom scale will be used if specified. Default value is 1/sqrt(head_size)
-#### Inputs +#### Inputs (7 - 9)
query : T
-
Query with shape (batch_size, sequence_length, hidden_size)
-
key : T
+
Query with shape (batch_size, sequence_length, hidden_size), or packed QKV with shape (batch_size, sequence_length, d) where d is (num_heads * head_size + 2 * kv_num_heads * head_size).&lt;/dd&gt;
+
key (optional) : T
Key with shape (batch_size, kv_sequence_length, kv_hidden_size)
-
value : T
+
value (optional) : T
Value with shape (batch_size, kv_sequence_length, kv_hidden_size)
past_key (optional) : T
past state key with support for format BNSH. When past_key uses same tensor as present_key(k-v cache), it is of length max_sequence_length... otherwise of length past_sequence_length.
@@ -2421,6 +2494,10 @@ This version of the operator has been available since version 1 of the 'com.micr
1d Tensor of shape (batch_size). Indicates past sequence lengths for token generation case.
total_sequence_length : M
Scalar tensor of total sequence length (past + new).
+
cos_cache (optional) : T
+
2D tensor with shape (max_sequence_length, head_size / 2).
+
sin_cache (optional) : T
+
2D tensor with shape (max_sequence_length, head_size / 2).
#### Outputs @@ -2437,7 +2514,7 @@ This version of the operator has been available since version 1 of the 'com.micr #### Type Constraints
-
T : tensor(float16)
+
T : tensor(float16), tensor(bfloat16)
Constrain input and output to float tensors.
M : tensor(int32)
Constrain mask to int tensor.
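To make the packed-QKV layout described above concrete, here is a minimal NumPy sketch of splitting a fused QKV projection of width d = num_heads * head_size + 2 * kv_num_heads * head_size back into q, k, and v. The dimension values and the q-then-k-then-v ordering are illustrative assumptions consistent with the formula for d, not part of the operator spec.

```python
import numpy as np

# Illustrative sizes only; GroupQueryAttention imposes no particular values.
batch, seq, num_heads, kv_num_heads, head_size = 2, 8, 16, 4, 64
d = num_heads * head_size + 2 * kv_num_heads * head_size

packed_qkv = np.random.randn(batch, seq, d).astype(np.float16)

# q occupies the first num_heads * head_size channels, followed by
# k and v with kv_num_heads * head_size channels each.
q_end = num_heads * head_size
k_end = q_end + kv_num_heads * head_size
q = packed_qkv[..., :q_end]       # (batch, seq, num_heads * head_size)
k = packed_qkv[..., q_end:k_end]  # (batch, seq, kv_num_heads * head_size)
v = packed_qkv[..., k_end:]       # (batch, seq, kv_num_heads * head_size)
```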
@@ -2783,7 +2860,7 @@ This version of the operator has been available since version 1 of the 'com.micr
Constrain input A data type to 8-bit integer tensor.
T2 : tensor(int8), tensor(uint8)
Constrain input B data type to 8-bit integer tensor.
-
T3 : tensor(float)
+
T3 : tensor(float), tensor(float16)
Constrain input a_scale, b_scale and output Y data type as float tensor.
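The MatMulNBits hunk below reworks the description of how quantized weights are packed. As a quick illustration of the common 4-bit case it describes (two 4-bit values per uint8_t, with n_blocks_per_col = CeilDiv(K, block_size) and blob_size = CeilDiv(block_size * bits, 8)), here is a NumPy sketch; the low-nibble-first order shown is an assumption for illustration, not the kernel's documented layout.

```python
import numpy as np

K, N, block_size, bits = 128, 4, 32, 4
n_blocks_per_col = (K + block_size - 1) // block_size  # CeilDiv(K, block_size)
blob_size = (block_size * bits + 7) // 8               # CeilDiv(block_size * bits, 8)

# Quantized 4-bit weight values in [0, 15], blocked along K for each of N columns.
q = np.random.randint(0, 16, size=(N, n_blocks_per_col, block_size), dtype=np.uint8)

# Pack two 4-bit values into each uint8 (low nibble holds the even index here).
packed = (q[..., 0::2] | (q[..., 1::2] << 4)).astype(np.uint8)
assert packed.shape == (N, n_blocks_per_col, blob_size)
```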
@@ -2796,22 +2873,23 @@ This version of the operator has been available since version 1 of the 'com.micr And block_size is not an arbitrary number and must be a power of 2 and not smaller than 16, like 16, 32, 64, 128,.. 3. Input B's scale and zero point are specified by input scales and zero_points. - Input B is stored as uint8_t with shape: [N][n_blocks_per_col][blob_size] in which: - - n_blocks_per_col = (K + block_size - 1) / block_size - - blob_size = block_size / 8 * bits + Input B is stored as uint8_t with shape: [N][n_blocks_per_col][blob_size] in which: + - n_blocks_per_col = (K + block_size - 1) / block_size + - blob_size = CeilDiv(block_size * bits, bitsof(uint8_t), i.e. 8) + For all bits from 2-8, a row of data is stored compactly and represented by uint8_t. + - for 2, 4, 8 bits, 4x2bit, 2x4bit, 1x8bit are stored in one uint8_t. + 4bit example: + |.|.|.|.| .|.|.|.| = uint8_t (2x4bit) + - for 3, 5, 6, 7 bits, 32x3bit, 32x5bit, 16x6bit, 32x7bit are stored in 12, 20, 12, and 28 uint8_t respectively. No bits are wasted. + 3bit example: + |.|.|. |.|.|. |.|.|. = 9 bits, which span 2 uint8_t; the highest bit of the second uint8_t is used. + The last uint8_t may have some bits unused. - For a block blob. It is stored in format: - struct Blob { - uint8 one_bits[(bits & 0x1) * 1 * block_size / 8]; // highest 1 bit for 3, 5, 7 bits quantization - uint8 two_bits[(bits & 0x2) * 2 * block_size / 8]; // high 2 bits for 2, 6, 7 bits quantization - uint8 four_bits[(bits & 0x4) * 4 * block_size / 8]; // low 4 bits for 4, 5, 6 bits quantization - } Input scales is stored in same type as original type of B(float32, float16) with shape like: [N * n_blocks_per_col] - Input zero_points is stored as uint8_t. If bits <= 4, two zero points are stored as one unit8_t. If bits > 4, one zero point is stored with one unit8_t. Thus, its shape is: - - [(N * n_blocks_per_col + 1) / 2] if bits <=4 - - [N * n_blocks_per_col] if bits > 4 - + Input zero_points is stored as uint8_t or in the same type as type(A). It has the same packing method as input B. + - [CeilDiv((N * n_blocks_per_col + 1) * bits, 8)] + If zero_points has the same type as A, it is not packed and has the same shape as scales. #### Version @@ -2832,17 +2910,19 @@ This version of the operator has been available since version 1 of the 'com.micr
number of groupsize used for weight quantization,(default 128). It needs to be a power of 2 and not smaller than 16.
-#### Inputs (3 - 4) +#### Inputs (3 - 5)
A : T1
The input tensor, not quantized
B : T2
-
1-dimensional data blob
+
1 or 2 dimensional data blob
scales : T1
quantization scale
-
zero_points (optional) : T2
+
zero_points (optional) : T3
quantization zero points
+
g_idx (optional) : T4
+
group_idx
#### Outputs @@ -2857,8 +2937,12 @@ This version of the operator has been available since version 1 of the 'com.micr
T1 : tensor(float), tensor(float16)
Constrain input and output types to float/half_float tensors.
-
T2 : tensor(uint8)
-
Constrain quantized weight types to uint8.
+
T2 : tensor(uint8), tensor(int32)
+
Constrain quantized weight types to uint8/int32.
+
T3 : tensor(uint8), tensor(int32), tensor(float16), tensor(float)
+
Constrain quantized zero point types to uint8/int32/float16/float.
+
T4 : tensor(int32)
+
the index tensor.
@@ -2912,8 +2996,8 @@ This version of the operator has been available since version 1 of the 'com.micr ### **com.microsoft.MoE** Mixture of experts. Examples: Switch transformer(https://arxiv.org/pdf/2101.03961.pdf) use top 1, - GLaM(https://arxiv.org/abs/2112.06905) activates top 2 FFN, and Vision MOE(https://arxiv.org/pdf/2106.05974.pdf) - usually uses top 32 experts. + GLaM(https://arxiv.org/abs/2112.06905) activates top 2 FFN, Vision MOE(https://arxiv.org/pdf/2106.05974.pdf) + usually uses top 32 experts and Mixtral(https://huggingface.co/blog/mixtral). #### Version @@ -2927,9 +3011,11 @@ This version of the operator has been available since version 1 of the 'com.micr
Activation function to use. Choose from relu, gelu, silu and identity. Default is relu
k : int
Number of top experts to select from expert pool
+
normalize_routing_weights : int
+
Whether to normalize routing weights
-#### Inputs (4 - 6) +#### Inputs (5 - 8)
input : T
@@ -2938,12 +3024,16 @@ This version of the operator has been available since version 1 of the 'com.micr
2D input tensor with shape (num_rows, num_experts)
fc1_experts_weights : T
3D input tensor with shape (num_experts, hidden_size, inter_size)
-
fc2_experts_weights : T
-
3D input tensor with shape (num_experts, inter_size, hidden_size)
fc1_experts_bias (optional) : T
2D optional input tensor with shape (num_experts, inter_size)
+
fc2_experts_weights : T
+
3D input tensor with shape (num_experts, inter_size, hidden_size)
fc2_experts_bias (optional) : T
2D optional input tensor with shape (num_experts, hidden_size)
+
fc3_experts_weights (optional) : T
+
3D optional input tensor with shape (num_experts, hidden_size, inter_size)
+
fc3_experts_bias (optional) : T
+
2D optional input tensor with shape (num_experts, inter_size)
#### Outputs @@ -3027,6 +3117,8 @@ This version of the operator has been available since version 1 of the 'com.micr
Number of attention heads
scale : float
Custom scale will be used if specified. Default value is 1/sqrt(head_size)
+
unidirectional : int
+
Whether every token can only attend to previous tokens. Default value is 0.
#### Inputs (1 - 8) @@ -4234,6 +4326,69 @@ This version of the operator has been available since version 1 of the 'com.micr +### **com.microsoft.QMoE** + + Int4 MoE + +#### Version + +This version of the operator has been available since version 1 of the 'com.microsoft' operator set. + +#### Attributes + +
+
activation_type : string
+
Activation function to use. Choose from relu, gelu, silu and identity. Default is relu
+
k : int
+
Number of top experts to select from expert pool
+
normalize_routing_weights : int
+
Whether to normalize routing weights
+
+ +#### Inputs (7 - 11) + +
+
input : T
+
2D input tensor with shape (num_rows, hidden_size) or 3D input tensor with shape (batch_size, sequence_length, hidden_size)
+
router_probs : T
+
2D input tensor with shape (num_rows, num_experts)
+
fc1_experts_weights : T1
+
3D input tensor with shape (num_experts, hidden_size, inter_size / 2)
+
fc1_scales : T
+
2D input tensor with shape (num_experts, inter_size)
+
fc1_experts_bias (optional) : T
+
2D optional input tensor with shape (num_experts, inter_size)
+
fc2_experts_weights : T1
+
3D input tensor with shape (num_experts, inter_size, hidden_size / 2)
+
fc2_scales : T
+
2D input tensor with shape (num_experts, hidden_size)
+
fc2_experts_bias (optional) : T
+
2D optional input tensor with shape (num_experts, hidden_size)
+
fc3_experts_weights (optional) : T1
+
3D optional input tensor with shape (num_experts, hidden_size, inter_size / 2)
+
fc3_scales (optional) : T
+
2D optional input tensor with shape (num_experts, inter_size)
+
fc3_experts_bias (optional) : T
+
2D optional input tensor with shape (num_experts, inter_size)
+
+ +#### Outputs + +
+
output : T
+
2D output tensor with shape (num_rows, hidden_size) or 3D output tensor with shape (batch_size, sequence_length, hidden_size)&lt;/dd&gt;
+
+ +#### Type Constraints + +
+
T : tensor(float16)
+
Constrain input and output types to float16 tensors.&lt;/dd&gt;
+
T1 : tensor(uint8)
+
Constrain weights type to uint8 tensors.
+
+ + ### **com.microsoft.QOrderedAttention** Quantized version of simplified Multi-Head Self Attention(using int8 with specific matrix Layout). @@ -5017,6 +5172,10 @@ This version of the operator has been available since version 1 of the 'com.micr
interleaved : int
Rotate using interleaved pattern. Default value is 0 (False).
+
num_heads : int
+
Number of attention heads. Default value is 0. Must be used together with rotary_embedding_dim&lt;/dd&gt;
+
rotary_embedding_dim : int
+
Rotary embedding dimension. Default value is 0.
scale : float
Custom scale will be used if specified. Default value is 1.0
@@ -5029,9 +5188,9 @@ This version of the operator has been available since version 1 of the 'com.micr
position_ids : M
1D tensor with shape (1) or 2D tensor with shape (batch_size, sequence_length)
cos_cache : T
-
2D tensor with shape (max_sequence_length, head_size / 2).
+
2D tensor with shape (max_sequence_length, head_size / 2) or (max_sequence_length, rotary_embedding_dim / 2)
sin_cache : T
-
2D tensor with shape (max_sequence_length, head_size / 2).
+
2D tensor with shape (max_sequence_length, head_size / 2) or (max_sequence_length, rotary_embedding_dim / 2)
#### Outputs @@ -5044,7 +5203,7 @@ This version of the operator has been available since version 1 of the 'com.micr #### Type Constraints
-
T : tensor(float), tensor(float16)
+
T : tensor(float), tensor(float16), tensor(bfloat16)
Constrain input and output types to float tensors.
M : tensor(int64)
Constrain input and output types to integer tensors
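As a sanity check on the cos_cache/sin_cache shapes above, here is a small NumPy sketch of the non-interleaved rotation (interleaved=0) that RotaryEmbedding describes, rotating only the first rotary_embedding_dim channels. This is a reading of the spec under assumed sizes and the usual 10000 base frequency, not the kernel's code.

```python
import numpy as np

batch, num_heads, seq, head_size, rot_dim = 1, 8, 16, 64, 32
x = np.random.randn(batch, num_heads, seq, head_size).astype(np.float32)
position_ids = np.arange(seq)

# cos/sin caches per the spec: (max_sequence_length, rotary_embedding_dim / 2).
inv_freq = 1.0 / (10000.0 ** (np.arange(0, rot_dim, 2) / rot_dim))
angles = np.outer(position_ids, inv_freq)   # (seq, rot_dim / 2)
cos, sin = np.cos(angles), np.sin(angles)

x_rot, x_pass = x[..., :rot_dim], x[..., rot_dim:]
x1, x2 = x_rot[..., : rot_dim // 2], x_rot[..., rot_dim // 2 :]
# Non-interleaved (half-split) rotation; interleaved=1 would instead pair
# adjacent channels (0,1), (2,3), ...
rotated = np.concatenate([x1 * cos - x2 * sin, x2 * cos + x1 * sin], axis=-1)
out = np.concatenate([rotated, x_pass], axis=-1)  # same shape as x
```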
@@ -5136,7 +5295,7 @@ This version of the operator has been available since version 1 of the 'com.micr
repetition_penalty (optional) : T
The parameter for repetition penalty. Default value 1.0 means no penalty. Accepts value > 0.0. Shape is (1)
vocab_mask (optional) : I
-
Mask of vocabulary. Words that masked with 0 are not allowed to be generated, and 1 is allowed. Shape is (vacab_size)
+
Mask of vocabulary. Words that masked with 0 are not allowed to be generated, and 1 is allowed. Shape is (vocab_size)
prefix_vocab_mask (optional) : I
Mask of vocabulary for first step. Words that masked with 0 are not allowed to be generated, and 1 is allowed. Shape is (batch_size, vocab_size)
attention_mask (optional) : I
@@ -5725,12 +5884,14 @@ This version of the operator has been available since version 1 of the 'com.micr #### Attributes
+
beginning_timestamp_token_id : int
+
The id of the first timestamp
decoder : graph (required)
Decoder subgraph to execute in a loop.
decoder_output_cross_qk : int
If nozero, decoder subgraph contains output Q*K from cross attentions. Default 0.
decoder_start_token_id : int
-
The id of the token that indicates decoding starts.
+
The id of the token that indicates decoding starts (i.e. the start of transcription token id)
early_stopping : int
early stop or not
encoder : graph
@@ -5743,15 +5904,23 @@ This version of the operator has been available since version 1 of the 'com.micr
Must be 2 for whisper
no_repeat_ngram_size : int
no repeat ngrams size
-
no_speech_token : int
+
no_speech_token_id : int
The token in whisper model that marks all sequence empty. With this model, whisper could output no_speech_prob after. Default -1.
+
no_timestamps_token_id : int
+
The id of the token that indicates no timestamps
pad_token_id : int (required)
The id of the padding token
+
start_of_lm_token_id : int
+
The id of the token that indicates LM starts
+
transcribe_token_id : int
+
The id of the transcribe task
+
translate_token_id : int
+
The id of the translate task
vocab_size : int
Size of the vocabulary. If not provided, it will be inferred from the decoder subgraph's output shape
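The Whisper-specific token-id attributes above (beginning_timestamp_token_id, no_timestamps_token_id, the transcribe/translate task ids, and the renamed no_speech_token_id) typically come straight from the Whisper tokenizer. Here is a hedged sketch of how they might be looked up, assuming the Hugging Face WhisperTokenizer and the standard Whisper special-token names; neither is mandated by this operator.

```python
from transformers import WhisperTokenizer

tok = WhisperTokenizer.from_pretrained("openai/whisper-tiny")

# Assumed mapping from Whisper special tokens to the attributes above.
attrs = {
    "decoder_start_token_id": tok.convert_tokens_to_ids("<|startoftranscript|>"),
    "no_timestamps_token_id": tok.convert_tokens_to_ids("<|notimestamps|>"),
    "beginning_timestamp_token_id": tok.convert_tokens_to_ids("<|0.00|>"),
    "transcribe_token_id": tok.convert_tokens_to_ids("<|transcribe|>"),
    "translate_token_id": tok.convert_tokens_to_ids("<|translate|>"),
    "start_of_lm_token_id": tok.convert_tokens_to_ids("<|startoflm|>"),
    "no_speech_token_id": tok.convert_tokens_to_ids("<|nospeech|>"),
    "pad_token_id": tok.pad_token_id,
}
```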
-#### Inputs (5 - 14) +#### Inputs (5 - 15)
input_ids : F
@@ -5765,11 +5934,11 @@ This version of the operator has been available since version 1 of the 'com.micr
num_return_sequences : I
The number of returned sequences in the batch. Shape is (1)
length_penalty (optional) : T
-
Exponential penalty to the length. Default value 1.0 means no penalty.Value > 1.0 encourages longer sequences, while values < 1.0 produces shorter sequences.Shape is (1,)
+
Exponential penalty to the length. Default value 1.0 means no penalty. Value > 1.0 encourages longer sequences, while values < 1.0 produces shorter sequences. Shape is (1,)
repetition_penalty (optional) : T
The parameter for repetition penalty. Default value 1.0 means no penalty. Accepts value > 0.0. Shape is (1)
vocab_mask (optional) : M
-
Mask of vocabulary. Words that masked with 0 are not allowed to be generated, and 1 is allowed. Shape is (vacab_size)
+
Mask of vocabulary. Words that masked with 0 are not allowed to be generated, and 1 is allowed. Shape is (vocab_size)
prefix_vocab_mask (optional) : M
Mask of vocabulary for first step. Words that masked with 0 are not allowed to be generated, and 1 is allowed. Shape is (batch_size, vocab_size)
attention_mask (optional) : I
@@ -5779,9 +5948,11 @@ This version of the operator has been available since version 1 of the 'com.micr
logits_processor (optional) : I
Specific logits processor for different types of beamsearch models. Default value 0 means no specific logit processor. Accepts value >= 0. Shape is (1)
cross_qk_layer_head (optional) : I
-
Only keep this list of (layer, head) of QK in the final cross_qk output when use_cross_qk is set. Default collect allits shape is (number of (layer, head) to keep, 2), i.e., [[layer_id1, head_id1], [layer_id2, head_id2]......]
+
Only keep this list of (layer, head) of QK in the final cross_qk output when use_cross_qk is set. By default all are collected. Its shape is (number of (layer, head) to keep, 2), i.e., [[layer_id1, head_id1], [layer_id2, head_id2], ...]&lt;/dd&gt;
extra_decoding_ids (optional) : I
Part of the decoder_input_ids that we need cross qk for it. it is of shape (batch_size, extra_decoding_ids_len).In such case, we should remove this from the tail of the decoder_input_ids, and put it here. ids < 0 in it (for multiple batch) are treated as stop of the extra_decoding_ids for corresponding batch.
+
temperature (optional) : T
+
Temperature value to apply to logits processing during this execution's decoding. Shape is (1)
#### Outputs (1 - 5) @@ -5792,11 +5963,11 @@ This version of the operator has been available since version 1 of the 'com.micr
sequences_scores (optional) : T
Final beam score of the generated sequences. Shape is (batch_size, num_return_sequences)
scores (optional) : T
-
Processed beam scores for each vocabulary token at each generation step.Beam scores consisting of log softmax scores for each vocabulary token and sum of log softmax of previously generated tokens in this beam.Shape is (max_length - sequence_length, batch_size, num_beams, vocab_size)
+
Processed beam scores for each vocabulary token at each generation step. Beam scores consisting of log softmax scores for each vocabulary token and sum of log softmax of previously generated tokens in this beam. Shape is (max_length - sequence_length, batch_size, num_beams, vocab_size)
cross_qk (optional) : V
-
Output the accumulated stacked Q*K in cross attentions. Let H = number of Head of cross attention, F = the frames or kv-seq-len of the cross attention input, T = real decoded token length, L = number of layers,B = batch size, R = num_return_sequences. It then should return tensor of shape [B, R, L*H, T, F].If cross_qk_layer_head is given, shape is [B, R, cross_qk_layer_head.shape[0], T, F]
+
Output the accumulated stacked Q*K in cross attentions. Let H = number of Head of cross attention, F = the frames or kv-seq-len of the cross attention input, T = real decoded token length, L = number of layers, B = batch size, R = num_return_sequences. It then should return tensor of shape [B, R, L*H, T, F]. If cross_qk_layer_head is given, shape is [B, R, cross_qk_layer_head.shape[0], T, F]
non_speech_probs (optional) : T
-
For whisper model, output the probabilities from logits after encoder and context decoding for the no_speech_token.Currently we treat the last token's logits is what we need, in future extra graph logic may be add to the encoder/context-decoder subgraph.The prob is save before logits may be updated by extra-decoding-ids. The shape of non_speech_probs is [B]
+
For whisper model, output the probabilities from logits after encoder and context decoding for the no_speech_token_id. The shape of non_speech_probs is [B]
#### Type Constraints diff --git a/docs/How_To_Update_ONNX_Dev_Notes.md index fd787b017617..264c620a8e69 100644 --- a/docs/How_To_Update_ONNX_Dev_Notes.md +++ b/docs/How_To_Update_ONNX_Dev_Notes.md @@ -17,9 +17,12 @@ git add onnx 1. Update [cgmanifests/generated/cgmanifest.json](/cgmanifests/generated/cgmanifest.json). This file should be generated. See [cgmanifests/README](/cgmanifests/README.md) for instructions. -1. Update [tools/ci_build/github/linux/docker/scripts/requirements.txt](/tools/ci_build/github/linux/docker/scripts/requirements.txt) - and [tools/ci_build/github/linux/docker/scripts/manylinux/requirements.txt](/tools/ci_build/github/linux/docker/scripts/manylinux/requirements.txt). - Update the commit hash for `git+http://github.com/onnx/onnx.git@targetonnxcommithash#egg=onnx`. +1. Update Python requirements files with the updated ONNX version (e.g., `onnx==1.16.0`) or commit hash if building from source (e.g., `git+http://github.com/onnx/onnx.git@targetonnxcommithash#egg=onnx`). +- [onnxruntime/test/python/requirements.txt](/onnxruntime/test/python/requirements.txt) +- [tools/ci_build/github/linux/docker/scripts/requirements.txt](/tools/ci_build/github/linux/docker/scripts/requirements.txt) +- [tools/ci_build/github/linux/docker/scripts/manylinux/requirements.txt](/tools/ci_build/github/linux/docker/scripts/manylinux/requirements.txt) +- [tools/ci_build/github/linux/docker/inference/x64/python/cpu/scripts/requirements.txt](/tools/ci_build/github/linux/docker/inference/x64/python/cpu/scripts/requirements.txt) +- Run `git grep -rn "onnx==1" .` to find other locations and update this document if necessary. 1. If there is any change to `cmake/external/onnx/onnx/*.in.proto`, you need to regenerate OnnxMl.cs. [Building onnxruntime with Nuget](https://onnxruntime.ai/docs/build/inferencing.html#build-nuget-packages) will do diff --git a/docs/Memory_Optimizer.md b/docs/Memory_Optimizer.md index 97f7e7ff2c14..d08ba7b8f83c 100644 --- a/docs/Memory_Optimizer.md +++ b/docs/Memory_Optimizer.md @@ -30,10 +30,10 @@ Integrate models using `ORTModule`. ``` There are two modes to enable the memory optimizations: -- Aggressively Recompute All Within Each Transformer Layer, enabled by `export ORTMODULE_MEMORY_OPT_LEVEL=1`. This will recompute all detected subgraphs within each Transformer Attention+MLP layer. It is easy to enable, but be noted this recompute plan may NOT be the best one. In this mode, `ORTMODULE_MEMORY_OPT_CONFIG` env values passed by users are not respected. -- User Specified Subgraph Recompute, enabled by `export ORTMODULE_MEMORY_OPT_LEVEL=0` and `export ORTMODULE_MEMORY_OPT_CONFIG=,,...`. This is an advanced usage, that allows users to find the most suitable graphs to recompute, at the cost of overhead to look for the best plans. +- Transformer layerwise recompute, i.e., aggressively recompute all supported nodes within each transformer layer (usually including attention and MLP sublayers), enabled by `export ORTMODULE_MEMORY_OPT_LEVEL=1`. In this mode, `ORTMODULE_MEMORY_OPT_CONFIG` env values passed by users are not respected. +- Manually selected subgraph recompute, enabled by `export ORTMODULE_MEMORY_OPT_LEVEL=0` and `export ORTMODULE_MEMORY_OPT_CONFIG=,,...`. This is an advanced usage that allows users to find the most suitable graphs to recompute, at the cost of overhead to look for the best plans.
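Since both modes above are driven entirely by environment variables, they can also be set from Python before ORTModule first compiles the graph. A minimal sketch with a placeholder model (the wrapped module and shapes are illustrative, not from the docs):

```python
import os
# Must be set before the first forward pass builds the training graph.
os.environ["ORTMODULE_MEMORY_OPT_LEVEL"] = "1"  # transformer layerwise recompute

import torch
from onnxruntime.training.ortmodule import ORTModule

model = ORTModule(torch.nn.Linear(784, 10))  # placeholder for a transformer model
loss = model(torch.randn(2, 784)).sum()
loss.backward()  # stashed activations are recomputed instead of kept
```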
-### Mode 1 - Simple Usage (Aggressively Recompute All Within Each Transformer Layer) +### Mode 1 - Simple Usage (Transformer Layerwise Recompute) 1. Set memory optimization level to be TRANSFORMER_LAYERWISE_RECOMPUTE, by `export ORTMODULE_MEMORY_OPT_LEVEL=1` @@ -51,6 +51,7 @@ There are two modes to enable the memory optimizations: - Plan 8 : OFF : Cast+:2:-1 1 2,048 2.0*inputs_input_ids_dim0*inputs_input_ids_dim1 ``` 3. As shown above, `Config` is a string representative for a re-computable subgraph. All are enabled for recompute in this case. +4. By `export ORTMODULE_MEMORY_OPT_LEVEL=2`, all plans including compromised recomputable subgraphs will also be enabled. ### Mode 2 - Advanced Usage (User Selected Subgraph Recompute) diff --git a/docs/ORTModule_Convergence_Notes.md b/docs/ORTModule_Convergence_Notes.md index 791b6c32c9b4..2374e7b7c538 100644 --- a/docs/ORTModule_Convergence_Notes.md +++ b/docs/ORTModule_Convergence_Notes.md @@ -89,7 +89,7 @@ The limitation of `GlobalSubscriberManager` is, only 'nn.Module's forward output dump the intermediate tensors in a `nn.Module`'s forward function, refer to the following example: ```diff -+ from onnxruntime.training.utils import inspect_activation ++ from onnxruntime.training.utils.hooks import inspect_activation class BloomForCausalLM(BloomPreTrainedModel): def __init__(self, config: BloomConfig): ... diff --git a/docs/ORTModule_Training_Guidelines.md b/docs/ORTModule_Training_Guidelines.md index bede16204d42..54137937ad56 100644 --- a/docs/ORTModule_Training_Guidelines.md +++ b/docs/ORTModule_Training_Guidelines.md @@ -246,7 +246,7 @@ to standard outputs. #### ORTMODULE_ENABLE_EMBEDDING_SPARSE_OPTIMIZER - **Feature Area**: *ORTMODULE/Optimizations* -- **Description**: By default, this is disabled. This env var can be used for enabling or disabling the embedding input +- **Description**: By default, this is enabled. This env var can be used for enabling or disabling the embedding input data sparsity based performance optimizations. ```bash @@ -287,12 +287,25 @@ A classical usage of disabling the deep copy: when the deep copy before module e #### ORTMODULE_MEMORY_OPT_LEVEL - **Feature Area**: *ORTMODULE/Optimizations* -- **Description**: By default, the level is 0. This env var can be used for enabling recomputation for reducing memory peak requirement. Setting the level to be 0 means all detected subgraphs with each transformer-based model layer generating stashed activations will be recomputed. This is conceptually equivalent to PyTorch's gradient checkpoint. When level is not 0, check Check [Memory Optimizer for ONNX Runtime Training](Memory_Optimizer.md) for more details. +- **Description**: By default, the level is 0. This env var can be used for enabling recomputation for reducing memory peak requirement. + - Setting the level to be 1 means all detected recomputable subgraphs (NOT including compromised recomputable graphs) with each transformer-based model layer generating stashed activations will be recomputed. This is conceptually equivalent to PyTorch's gradient checkpoint. + - Setting the level to be 2 means all detected recomputable subgraphs (including compromised recomputable graphs) with each transformer-based model layer generating stashed activations will be recomputed. This is conceptually equivalent to PyTorch's gradient checkpoint. + - When the level is 0, check [Memory Optimizer for ONNX Runtime Training](Memory_Optimizer.md) for more details.
```bash export ORTMODULE_MEMORY_OPT_LEVEL=0 ``` +#### ORTMODULE_ENABLE_MEM_EFFICIENT_GRAD_MGMT + +- **Feature Area**: *ORTMODULE/Optimizations* +- **Description**: By default, the memory-efficient gradient management is turned off. Once a gradient is computed in ONNX Runtime, it will trigger the corresponding parameter's backward function through the `PythonOpGrad` operator. This helps release the gradient buffer managed in ONNX Runtime, which otherwise is released only once all backward computation finishes. + + ```bash + export ORTMODULE_ENABLE_MEM_EFFICIENT_GRAD_MGMT=1 # Enable + export ORTMODULE_ENABLE_MEM_EFFICIENT_GRAD_MGMT=0 # Disable + ``` + ### 2.2 Memory Optimization Q: *Want to run a bigger batch size?* diff --git a/docs/OperatorKernels.md b/docs/OperatorKernels.md index 1ce9b3254d91..5bae5ea62657 100644 --- a/docs/OperatorKernels.md +++ b/docs/OperatorKernels.md @@ -51,7 +51,8 @@ Do not modify directly.* |BitwiseOr|*in* A:**T**
*in* B:**T**
*out* C:**T**|18+|**T** = tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| |BitwiseXor|*in* A:**T**
*in* B:**T**
*out* C:**T**|18+|**T** = tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| |BlackmanWindow|*in* size:**T1**
*out* output:**T2**|17+|**T1** = tensor(int32), tensor(int64)
**T2** = tensor(double), tensor(float), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| -|Cast|*in* input:**T1**
*out* output:**T2**|19+|**T1** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**T2** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| +|Cast|*in* input:**T1**
*out* output:**T2**|21+|**T1** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**T2** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| +|||[19, 20]|**T1** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**T2** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| |||[13, 18]|**T1** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**T2** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| |||[6, 12]|**T1** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**T2** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| |Ceil|*in* X:**T**
*out* Y:**T**|13+|**T** = tensor(double), tensor(float)|
@@ -68,7 +69,8 @@ Do not modify directly.*
|||[11, 12]|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| |||[4, 10]|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| |ConcatFromSequence|*in* input_sequence:**S**
*out* concat_result:**T**|11+|**S** = seq(tensor(bfloat16)), seq(tensor(bool)), seq(tensor(double)), seq(tensor(float)), seq(tensor(float16)), seq(tensor(int16)), seq(tensor(int32)), seq(tensor(int64)), seq(tensor(int8)), seq(tensor(string)), seq(tensor(uint16)), seq(tensor(uint32)), seq(tensor(uint64)), seq(tensor(uint8))| -|ConstantOfShape|*in* input:**T1**
*out* output:**T2**|20+|**T1** = tensor(int64)
**T2** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| +|ConstantOfShape|*in* input:**T1**
*out* output:**T2**|21+|**T1** = tensor(int64)
**T2** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| +|||20|**T1** = tensor(int64)
**T2** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| |||[9, 19]|**T1** = tensor(int64)
**T2** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| |Conv|*in* X:**T**
*in* W:**T**
*in* B:**T**
*out* Y:**T**|11+|**T** = tensor(float)| |||[1, 10]|**T** = tensor(float)|
@@ -85,7 +87,8 @@ Do not modify directly.*
|DepthToSpace|*in* input:**T**
*out* output:**T**|13+|**T** = tensor(double), tensor(float)| |||[11, 12]|**T** = tensor(double), tensor(float)| |||[1, 10]|**T** = tensor(double), tensor(float)| -|DequantizeLinear|*in* x:**T**
*in* x_scale:**tensor(float)**
*in* x_zero_point:**T**
*out* y:**tensor(float)**

or

*in* x:**T1**
*in* x_scale:**T2**
*in* x_zero_point:**T1**
*out* y:**T2**|19+|**T1** = tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz), tensor(int32), tensor(int8), tensor(uint8)
**T2** = tensor(float), tensor(float16)| +|DequantizeLinear|*in* x:**T**
*in* x_scale:**tensor(float)**
*in* x_zero_point:**T**
*out* y:**tensor(float)**

or

*in* x:**T1**
*in* x_scale:**T2**
*in* x_zero_point:**T1**
*out* y:**T2**|21+|**T1** = tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz), tensor(int16), tensor(int32), tensor(int8), tensor(uint16), tensor(uint8)
**T2** = tensor(float), tensor(float16)| +|||[19, 20]|**T1** = tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz), tensor(int32), tensor(int8), tensor(uint8)
**T2** = tensor(float), tensor(float16)| |||[13, 18]|**T** = tensor(int32), tensor(int8), tensor(uint8)| |||[10, 12]|**T** = tensor(int32), tensor(int8), tensor(uint8)| |Det|*in* X:**T**
*out* Y:**T**|11+|**T** = tensor(float)|
@@ -111,7 +114,8 @@ Do not modify directly.*
|Expand|*in* input:**T**
*in* shape:**tensor(int64)**
*out* output:**T**|13+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| |||[8, 12]|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| |EyeLike|*in* input:**T1**
*out* output:**T2**|9+|**T1** = tensor(double), tensor(float), tensor(int32), tensor(int64), tensor(uint64)
**T2** = tensor(double), tensor(float), tensor(int32), tensor(int64), tensor(uint64)| -|Flatten|*in* input:**T**
*out* output:**T**|13+|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| +|Flatten|*in* input:**T**
*out* output:**T**|21+|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| +|||[13, 20]|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| |||[11, 12]|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| |||[9, 10]|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| |||[1, 8]|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
@@ -127,6 +131,7 @@ Do not modify directly.*
|GatherND|*in* data:**T**
*in* indices:**tensor(int64)**
*out* output:**T**|13+|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**indices** = tensor(int64)| |||12|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**indices** = tensor(int64)| |||11|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**indices** = tensor(int64)| +|Gelu|*in* X:**T**
*out* Y:**T**|20+|**T** = tensor(float)| |Gemm|*in* A:**T**
*in* B:**T**
*in* C:**T**
*out* Y:**T**|13+|**T** = tensor(double), tensor(float)| |||[11, 12]|**T** = tensor(double), tensor(float)| |||[9, 10]|**T** = tensor(double), tensor(float)|
@@ -147,21 +152,23 @@ Do not modify directly.*
|Hardmax|*in* input:**T**
*out* output:**T**|13+|**T** = tensor(float)| |||[11, 12]|**T** = tensor(float)| |||[1, 10]|**T** = tensor(float)| -|Identity|*in* input:**T**
*out* output:**T**

or

*in* input:**V**
*out* output:**V**|19+|**V** = optional(seq(tensor(bfloat16))), optional(seq(tensor(bool))), optional(seq(tensor(double))), optional(seq(tensor(float))), optional(seq(tensor(float16))), optional(seq(tensor(int16))), optional(seq(tensor(int32))), optional(seq(tensor(int64))), optional(seq(tensor(int8))), optional(seq(tensor(string))), optional(seq(tensor(uint16))), optional(seq(tensor(uint32))), optional(seq(tensor(uint64))), optional(seq(tensor(uint8))), optional(tensor(bfloat16)), optional(tensor(bool)), optional(tensor(double)), optional(tensor(float)), optional(tensor(float16)), optional(tensor(int16)), optional(tensor(int32)), optional(tensor(int64)), optional(tensor(int8)), optional(tensor(string)), optional(tensor(uint16)), optional(tensor(uint32)), optional(tensor(uint64)), optional(tensor(uint8)), seq(tensor(bfloat16)), seq(tensor(bool)), seq(tensor(double)), seq(tensor(float)), seq(tensor(float16)), seq(tensor(float8e4m3fn)), seq(tensor(float8e4m3fnuz)), seq(tensor(float8e5m2)), seq(tensor(float8e5m2fnuz)), seq(tensor(int16)), seq(tensor(int32)), seq(tensor(int64)), seq(tensor(int8)), seq(tensor(string)), seq(tensor(uint16)), seq(tensor(uint32)), seq(tensor(uint64)), seq(tensor(uint8)), tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| +|Identity|*in* input:**T**
*out* output:**T**

or

*in* input:**V**
*out* output:**V**|21+|**V** = optional(seq(tensor(bfloat16))), optional(seq(tensor(bool))), optional(seq(tensor(double))), optional(seq(tensor(float))), optional(seq(tensor(float16))), optional(seq(tensor(int16))), optional(seq(tensor(int32))), optional(seq(tensor(int64))), optional(seq(tensor(int8))), optional(seq(tensor(string))), optional(seq(tensor(uint16))), optional(seq(tensor(uint32))), optional(seq(tensor(uint64))), optional(seq(tensor(uint8))), optional(tensor(bfloat16)), optional(tensor(bool)), optional(tensor(double)), optional(tensor(float)), optional(tensor(float16)), optional(tensor(int16)), optional(tensor(int32)), optional(tensor(int64)), optional(tensor(int8)), optional(tensor(string)), optional(tensor(uint16)), optional(tensor(uint32)), optional(tensor(uint64)), optional(tensor(uint8)), seq(tensor(bfloat16)), seq(tensor(bool)), seq(tensor(double)), seq(tensor(float)), seq(tensor(float16)), seq(tensor(float8e4m3fn)), seq(tensor(float8e4m3fnuz)), seq(tensor(float8e5m2)), seq(tensor(float8e5m2fnuz)), seq(tensor(int16)), seq(tensor(int32)), seq(tensor(int64)), seq(tensor(int8)), seq(tensor(string)), seq(tensor(uint16)), seq(tensor(uint32)), seq(tensor(uint64)), seq(tensor(uint8)), tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| +|||[19, 20]|**V** = optional(seq(tensor(bfloat16))), optional(seq(tensor(bool))), optional(seq(tensor(double))), optional(seq(tensor(float))), optional(seq(tensor(float16))), optional(seq(tensor(int16))), optional(seq(tensor(int32))), optional(seq(tensor(int64))), optional(seq(tensor(int8))), optional(seq(tensor(string))), optional(seq(tensor(uint16))), optional(seq(tensor(uint32))), optional(seq(tensor(uint64))), optional(seq(tensor(uint8))), optional(tensor(bfloat16)), optional(tensor(bool)), optional(tensor(double)), optional(tensor(float)), optional(tensor(float16)), optional(tensor(int16)), optional(tensor(int32)), optional(tensor(int64)), optional(tensor(int8)), optional(tensor(string)), optional(tensor(uint16)), optional(tensor(uint32)), optional(tensor(uint64)), optional(tensor(uint8)), seq(tensor(bfloat16)), seq(tensor(bool)), seq(tensor(double)), seq(tensor(float)), seq(tensor(float16)), seq(tensor(float8e4m3fn)), seq(tensor(float8e4m3fnuz)), seq(tensor(float8e5m2)), seq(tensor(float8e5m2fnuz)), seq(tensor(int16)), seq(tensor(int32)), seq(tensor(int64)), seq(tensor(int8)), seq(tensor(string)), seq(tensor(uint16)), seq(tensor(uint32)), seq(tensor(uint64)), seq(tensor(uint8)), tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| |||[16, 18]|**V** = optional(seq(tensor(bfloat16))), optional(seq(tensor(bool))), optional(seq(tensor(double))), optional(seq(tensor(float))), optional(seq(tensor(float16))), optional(seq(tensor(int16))), optional(seq(tensor(int32))), optional(seq(tensor(int64))), optional(seq(tensor(int8))), optional(seq(tensor(string))), optional(seq(tensor(uint16))), optional(seq(tensor(uint32))), optional(seq(tensor(uint64))), optional(seq(tensor(uint8))), optional(tensor(bfloat16)), optional(tensor(bool)), 
optional(tensor(double)), optional(tensor(float)), optional(tensor(float16)), optional(tensor(int16)), optional(tensor(int32)), optional(tensor(int64)), optional(tensor(int8)), optional(tensor(string)), optional(tensor(uint16)), optional(tensor(uint32)), optional(tensor(uint64)), optional(tensor(uint8)), seq(tensor(bfloat16)), seq(tensor(bool)), seq(tensor(double)), seq(tensor(float)), seq(tensor(float16)), seq(tensor(int16)), seq(tensor(int32)), seq(tensor(int64)), seq(tensor(int8)), seq(tensor(string)), seq(tensor(uint16)), seq(tensor(uint32)), seq(tensor(uint64)), seq(tensor(uint8)), tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| |||[14, 15]|**V** = seq(tensor(bfloat16)), seq(tensor(bool)), seq(tensor(double)), seq(tensor(float)), seq(tensor(float16)), seq(tensor(int16)), seq(tensor(int32)), seq(tensor(int64)), seq(tensor(int8)), seq(tensor(string)), seq(tensor(uint16)), seq(tensor(uint32)), seq(tensor(uint64)), seq(tensor(uint8)), tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| |||13|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| |||[1, 12]|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| -|If|*in* cond:**B**
*out* outputs:**V**|19+|**B** = tensor(bool)
**V** = optional(seq(tensor(bfloat16))), optional(seq(tensor(bool))), optional(seq(tensor(double))), optional(seq(tensor(float))), optional(seq(tensor(float16))), optional(seq(tensor(int16))), optional(seq(tensor(int32))), optional(seq(tensor(int64))), optional(seq(tensor(int8))), optional(seq(tensor(string))), optional(seq(tensor(uint16))), optional(seq(tensor(uint32))), optional(seq(tensor(uint64))), optional(seq(tensor(uint8))), optional(tensor(bfloat16)), optional(tensor(bool)), optional(tensor(double)), optional(tensor(float)), optional(tensor(float16)), optional(tensor(int16)), optional(tensor(int32)), optional(tensor(int64)), optional(tensor(int8)), optional(tensor(string)), optional(tensor(uint16)), optional(tensor(uint32)), optional(tensor(uint64)), optional(tensor(uint8)), seq(tensor(bfloat16)), seq(tensor(bool)), seq(tensor(double)), seq(tensor(float)), seq(tensor(float16)), seq(tensor(float8e4m3fn)), seq(tensor(float8e4m3fnuz)), seq(tensor(float8e5m2)), seq(tensor(float8e5m2fnuz)), seq(tensor(int16)), seq(tensor(int32)), seq(tensor(int64)), seq(tensor(int8)), seq(tensor(string)), seq(tensor(uint16)), seq(tensor(uint32)), seq(tensor(uint64)), seq(tensor(uint8)), tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| +|If|*in* cond:**B**
*out* outputs:**V**|21+|**B** = tensor(bool)
**V** = optional(seq(tensor(bfloat16))), optional(seq(tensor(bool))), optional(seq(tensor(double))), optional(seq(tensor(float))), optional(seq(tensor(float16))), optional(seq(tensor(int16))), optional(seq(tensor(int32))), optional(seq(tensor(int64))), optional(seq(tensor(int8))), optional(seq(tensor(string))), optional(seq(tensor(uint16))), optional(seq(tensor(uint32))), optional(seq(tensor(uint64))), optional(seq(tensor(uint8))), optional(tensor(bfloat16)), optional(tensor(bool)), optional(tensor(double)), optional(tensor(float)), optional(tensor(float16)), optional(tensor(int16)), optional(tensor(int32)), optional(tensor(int64)), optional(tensor(int8)), optional(tensor(string)), optional(tensor(uint16)), optional(tensor(uint32)), optional(tensor(uint64)), optional(tensor(uint8)), seq(tensor(bfloat16)), seq(tensor(bool)), seq(tensor(double)), seq(tensor(float)), seq(tensor(float16)), seq(tensor(float8e4m3fn)), seq(tensor(float8e4m3fnuz)), seq(tensor(float8e5m2)), seq(tensor(float8e5m2fnuz)), seq(tensor(int16)), seq(tensor(int32)), seq(tensor(int64)), seq(tensor(int8)), seq(tensor(string)), seq(tensor(uint16)), seq(tensor(uint32)), seq(tensor(uint64)), seq(tensor(uint8)), tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| +|||[19, 20]|**B** = tensor(bool)
**V** = optional(seq(tensor(bfloat16))), optional(seq(tensor(bool))), optional(seq(tensor(double))), optional(seq(tensor(float))), optional(seq(tensor(float16))), optional(seq(tensor(int16))), optional(seq(tensor(int32))), optional(seq(tensor(int64))), optional(seq(tensor(int8))), optional(seq(tensor(string))), optional(seq(tensor(uint16))), optional(seq(tensor(uint32))), optional(seq(tensor(uint64))), optional(seq(tensor(uint8))), optional(tensor(bfloat16)), optional(tensor(bool)), optional(tensor(double)), optional(tensor(float)), optional(tensor(float16)), optional(tensor(int16)), optional(tensor(int32)), optional(tensor(int64)), optional(tensor(int8)), optional(tensor(string)), optional(tensor(uint16)), optional(tensor(uint32)), optional(tensor(uint64)), optional(tensor(uint8)), seq(tensor(bfloat16)), seq(tensor(bool)), seq(tensor(double)), seq(tensor(float)), seq(tensor(float16)), seq(tensor(float8e4m3fn)), seq(tensor(float8e4m3fnuz)), seq(tensor(float8e5m2)), seq(tensor(float8e5m2fnuz)), seq(tensor(int16)), seq(tensor(int32)), seq(tensor(int64)), seq(tensor(int8)), seq(tensor(string)), seq(tensor(uint16)), seq(tensor(uint32)), seq(tensor(uint64)), seq(tensor(uint8)), tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| |||[16, 18]|**B** = tensor(bool)
**V** = optional(seq(tensor(bfloat16))), optional(seq(tensor(bool))), optional(seq(tensor(double))), optional(seq(tensor(float))), optional(seq(tensor(float16))), optional(seq(tensor(int16))), optional(seq(tensor(int32))), optional(seq(tensor(int64))), optional(seq(tensor(int8))), optional(seq(tensor(string))), optional(seq(tensor(uint16))), optional(seq(tensor(uint32))), optional(seq(tensor(uint64))), optional(seq(tensor(uint8))), optional(tensor(bfloat16)), optional(tensor(bool)), optional(tensor(double)), optional(tensor(float)), optional(tensor(float16)), optional(tensor(int16)), optional(tensor(int32)), optional(tensor(int64)), optional(tensor(int8)), optional(tensor(string)), optional(tensor(uint16)), optional(tensor(uint32)), optional(tensor(uint64)), optional(tensor(uint8)), seq(tensor(bfloat16)), seq(tensor(bool)), seq(tensor(double)), seq(tensor(float)), seq(tensor(float16)), seq(tensor(int16)), seq(tensor(int32)), seq(tensor(int64)), seq(tensor(int8)), seq(tensor(string)), seq(tensor(uint16)), seq(tensor(uint32)), seq(tensor(uint64)), seq(tensor(uint8)), tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| |||[13, 15]|**B** = tensor(bool)
**V** = seq(tensor(bfloat16)), seq(tensor(bool)), seq(tensor(double)), seq(tensor(float)), seq(tensor(float16)), seq(tensor(int16)), seq(tensor(int32)), seq(tensor(int64)), seq(tensor(int8)), seq(tensor(string)), seq(tensor(uint16)), seq(tensor(uint32)), seq(tensor(uint64)), seq(tensor(uint8)), tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| |||[11, 12]|**B** = tensor(bool)
**V** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| |||[1, 10]|**B** = tensor(bool)
**V** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| |ImageScaler|*in* input:**T**
*out* output:**T**|1+|**T** = tensor(float)| |InstanceNormalization|*in* input:**T**
*in* scale:**T**
*in* B:**T**
*out* output:**T**|6+|**T** = tensor(float)| -|IsInf|*in* X:**T1**
*out* Y:**T2**|20+|**T1** = tensor(double), tensor(float), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz)
**T2** = tensor(bool)| +|IsInf|*in* X:**T1**
*out* Y:**T2**|20+|**T1** = tensor(bfloat16), tensor(double), tensor(float), tensor(float16), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz)
**T2** = tensor(bool)| |||[10, 19]|**T1** = tensor(double), tensor(float)
**T2** = tensor(bool)| -|IsNaN|*in* X:**T1**
*out* Y:**T2**|20+|**T1** = tensor(double), tensor(float), tensor(float16), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz)
**T2** = tensor(bool)| +|IsNaN|*in* X:**T1**
*out* Y:**T2**|20+|**T1** = tensor(bfloat16), tensor(double), tensor(float), tensor(float16), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz)
**T2** = tensor(bool)| |||[13, 19]|**T1** = tensor(double), tensor(float), tensor(float16)
**T2** = tensor(bool)| |||[9, 12]|**T1** = tensor(double), tensor(float), tensor(float16)
**T2** = tensor(bool)| |LRN|*in* X:**T**
*out* Y:**T**|13+|**T** = tensor(float)|
@@ -182,7 +189,8 @@ Do not modify directly.*
|LogSoftmax|*in* input:**T**
*out* output:**T**|13+|**T** = tensor(double), tensor(float)| |||[11, 12]|**T** = tensor(double), tensor(float)| |||[1, 10]|**T** = tensor(double), tensor(float)| -|Loop|*in* M:**I**
*in* cond:**B**
*in* v_initial:**V**
*out* v_final_and_scan_outputs:**V**|19+|**B** = tensor(bool)
**I** = tensor(int64)
**V** = optional(seq(tensor(bfloat16))), optional(seq(tensor(bool))), optional(seq(tensor(double))), optional(seq(tensor(float))), optional(seq(tensor(float16))), optional(seq(tensor(int16))), optional(seq(tensor(int32))), optional(seq(tensor(int64))), optional(seq(tensor(int8))), optional(seq(tensor(string))), optional(seq(tensor(uint16))), optional(seq(tensor(uint32))), optional(seq(tensor(uint64))), optional(seq(tensor(uint8))), optional(tensor(bfloat16)), optional(tensor(bool)), optional(tensor(double)), optional(tensor(float)), optional(tensor(float16)), optional(tensor(int16)), optional(tensor(int32)), optional(tensor(int64)), optional(tensor(int8)), optional(tensor(string)), optional(tensor(uint16)), optional(tensor(uint32)), optional(tensor(uint64)), optional(tensor(uint8)), seq(tensor(bfloat16)), seq(tensor(bool)), seq(tensor(double)), seq(tensor(float)), seq(tensor(float16)), seq(tensor(float8e4m3fn)), seq(tensor(float8e4m3fnuz)), seq(tensor(float8e5m2)), seq(tensor(float8e5m2fnuz)), seq(tensor(int16)), seq(tensor(int32)), seq(tensor(int64)), seq(tensor(int8)), seq(tensor(string)), seq(tensor(uint16)), seq(tensor(uint32)), seq(tensor(uint64)), seq(tensor(uint8)), tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| +|Loop|*in* M:**I**
*in* cond:**B**
*in* v_initial:**V**
*out* v_final_and_scan_outputs:**V**|21+|**B** = tensor(bool)
**I** = tensor(int64)
**V** = optional(seq(tensor(bfloat16))), optional(seq(tensor(bool))), optional(seq(tensor(double))), optional(seq(tensor(float))), optional(seq(tensor(float16))), optional(seq(tensor(int16))), optional(seq(tensor(int32))), optional(seq(tensor(int64))), optional(seq(tensor(int8))), optional(seq(tensor(string))), optional(seq(tensor(uint16))), optional(seq(tensor(uint32))), optional(seq(tensor(uint64))), optional(seq(tensor(uint8))), optional(tensor(bfloat16)), optional(tensor(bool)), optional(tensor(double)), optional(tensor(float)), optional(tensor(float16)), optional(tensor(int16)), optional(tensor(int32)), optional(tensor(int64)), optional(tensor(int8)), optional(tensor(string)), optional(tensor(uint16)), optional(tensor(uint32)), optional(tensor(uint64)), optional(tensor(uint8)), seq(tensor(bfloat16)), seq(tensor(bool)), seq(tensor(double)), seq(tensor(float)), seq(tensor(float16)), seq(tensor(float8e4m3fn)), seq(tensor(float8e4m3fnuz)), seq(tensor(float8e5m2)), seq(tensor(float8e5m2fnuz)), seq(tensor(int16)), seq(tensor(int32)), seq(tensor(int64)), seq(tensor(int8)), seq(tensor(string)), seq(tensor(uint16)), seq(tensor(uint32)), seq(tensor(uint64)), seq(tensor(uint8)), tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| +|||[19, 20]|**B** = tensor(bool)
**I** = tensor(int64)
**V** = optional(seq(tensor(bfloat16))), optional(seq(tensor(bool))), optional(seq(tensor(double))), optional(seq(tensor(float))), optional(seq(tensor(float16))), optional(seq(tensor(int16))), optional(seq(tensor(int32))), optional(seq(tensor(int64))), optional(seq(tensor(int8))), optional(seq(tensor(string))), optional(seq(tensor(uint16))), optional(seq(tensor(uint32))), optional(seq(tensor(uint64))), optional(seq(tensor(uint8))), optional(tensor(bfloat16)), optional(tensor(bool)), optional(tensor(double)), optional(tensor(float)), optional(tensor(float16)), optional(tensor(int16)), optional(tensor(int32)), optional(tensor(int64)), optional(tensor(int8)), optional(tensor(string)), optional(tensor(uint16)), optional(tensor(uint32)), optional(tensor(uint64)), optional(tensor(uint8)), seq(tensor(bfloat16)), seq(tensor(bool)), seq(tensor(double)), seq(tensor(float)), seq(tensor(float16)), seq(tensor(float8e4m3fn)), seq(tensor(float8e4m3fnuz)), seq(tensor(float8e5m2)), seq(tensor(float8e5m2fnuz)), seq(tensor(int16)), seq(tensor(int32)), seq(tensor(int64)), seq(tensor(int8)), seq(tensor(string)), seq(tensor(uint16)), seq(tensor(uint32)), seq(tensor(uint64)), seq(tensor(uint8)), tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| |||[16, 18]|**B** = tensor(bool)
**I** = tensor(int64)
**V** = optional(seq(tensor(bfloat16))), optional(seq(tensor(bool))), optional(seq(tensor(double))), optional(seq(tensor(float))), optional(seq(tensor(float16))), optional(seq(tensor(int16))), optional(seq(tensor(int32))), optional(seq(tensor(int64))), optional(seq(tensor(int8))), optional(seq(tensor(string))), optional(seq(tensor(uint16))), optional(seq(tensor(uint32))), optional(seq(tensor(uint64))), optional(seq(tensor(uint8))), optional(tensor(bfloat16)), optional(tensor(bool)), optional(tensor(double)), optional(tensor(float)), optional(tensor(float16)), optional(tensor(int16)), optional(tensor(int32)), optional(tensor(int64)), optional(tensor(int8)), optional(tensor(string)), optional(tensor(uint16)), optional(tensor(uint32)), optional(tensor(uint64)), optional(tensor(uint8)), seq(tensor(bfloat16)), seq(tensor(bool)), seq(tensor(double)), seq(tensor(float)), seq(tensor(float16)), seq(tensor(int16)), seq(tensor(int32)), seq(tensor(int64)), seq(tensor(int8)), seq(tensor(string)), seq(tensor(uint16)), seq(tensor(uint32)), seq(tensor(uint64)), seq(tensor(uint8)), tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| |||[13, 15]|**B** = tensor(bool)
**I** = tensor(int64)
**V** = seq(tensor(bfloat16)), seq(tensor(bool)), seq(tensor(double)), seq(tensor(float)), seq(tensor(float16)), seq(tensor(int16)), seq(tensor(int32)), seq(tensor(int64)), seq(tensor(int8)), seq(tensor(string)), seq(tensor(uint16)), seq(tensor(uint32)), seq(tensor(uint64)), seq(tensor(uint8)), tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| |||[11, 12]|**B** = tensor(bool)
**I** = tensor(int64)
**V** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
@@ -238,7 +246,8 @@ Do not modify directly.*
|PRelu|*in* X:**T**
*in* slope:**T**
*out* Y:**T**|16+|**T** = tensor(float)| |||[9, 15]|**T** = tensor(float)| |||[7, 8]|**T** = tensor(float)| -|Pad|*in* data:**T**
*in* pads:**tensor(int64)**
*in* constant_value:**T**
*in* axes:**Tind**
*out* output:**T**

or

*in* data:**T**
*in* pads:**tensor(int64)**
*in* constant_value:**T**
*out* output:**T**

or

*in* data:**T**
*out* output:**T**|19+|**T** = tensor(bool), tensor(double), tensor(float), tensor(int32), tensor(int64), tensor(int8), tensor(uint32), tensor(uint64), tensor(uint8)| +|Pad|*in* data:**T**
*in* pads:**tensor(int64)**
*in* constant_value:**T**
*in* axes:**Tind**
*out* output:**T**

or

*in* data:**T**
*in* pads:**tensor(int64)**
*in* constant_value:**T**
*out* output:**T**

or

*in* data:**T**
*out* output:**T**|21+|**T** = tensor(bool), tensor(double), tensor(float), tensor(int32), tensor(int64), tensor(int8), tensor(uint32), tensor(uint64), tensor(uint8)| +|||[19, 20]|**T** = tensor(bool), tensor(double), tensor(float), tensor(int32), tensor(int64), tensor(int8), tensor(uint32), tensor(uint64), tensor(uint8)| |||18|**T** = tensor(bool), tensor(double), tensor(float), tensor(int32), tensor(int64), tensor(int8), tensor(uint32), tensor(uint64), tensor(uint8)| |||[13, 17]|**T** = tensor(bool), tensor(double), tensor(float), tensor(int32), tensor(int64), tensor(int8), tensor(uint32), tensor(uint64), tensor(uint8)| |||[11, 12]|**T** = tensor(double), tensor(float), tensor(int32), tensor(int64), tensor(int8), tensor(uint32), tensor(uint64), tensor(uint8)|
@@ -249,8 +258,9 @@ Do not modify directly.*
|||12|**T** = tensor(double), tensor(float), tensor(int32), tensor(int64)
**T1** = tensor(double), tensor(float), tensor(int32), tensor(int64)| |||[7, 11]|**T** = tensor(double), tensor(float)| |QLinearConv|*in* x:**T1**
*in* x_scale:**tensor(float)**
*in* x_zero_point:**T1**
*in* w:**T2**
*in* w_scale:**tensor(float)**
*in* w_zero_point:**T2**
*in* y_scale:**tensor(float)**
*in* y_zero_point:**T3**
*in* B:**T4**
*out* y:**T3**|10+|**T1** = tensor(int8), tensor(uint8)
**T2** = tensor(int8), tensor(uint8)
**T3** = tensor(int8), tensor(uint8)
**T4** = tensor(int32)| -|QLinearMatMul|*in* a:**T1**
*in* a_scale:**tensor(float)**
*in* a_zero_point:**T1**
*in* b:**T2**
*in* b_scale:**tensor(float)**
*in* b_zero_point:**T2**
*in* y_scale:**tensor(float)**
*in* y_zero_point:**T3**
*out* y:**T3**|10+|**T1** = tensor(int8), tensor(uint8)
**T2** = tensor(int8), tensor(uint8)
**T3** = tensor(int8), tensor(uint8)| -|QuantizeLinear|*in* x:**T1**
*in* y_scale:**T1**
*in* y_zero_point:**T2**
*out* y:**T2**

or

*in* x:**T1**
*in* y_scale:**tensor(float)**
*in* y_zero_point:**T2**
*out* y:**T2**|19+|**T1** = tensor(float), tensor(float16)
**T2** = tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz), tensor(int8), tensor(uint8)| +|QLinearMatMul|*in* a:**T1**
*in* a_scale:**TS**
*in* a_zero_point:**T1**
*in* b:**T2**
*in* b_scale:**TS**
*in* b_zero_point:**T2**
*in* y_scale:**TS**
*in* y_zero_point:**T3**
*out* y:**T3**

or

*in* a:**T1**
*in* a_scale:**tensor(float)**
*in* a_zero_point:**T1**
*in* b:**T2**
*in* b_scale:**tensor(float)**
*in* b_zero_point:**T2**
*in* y_scale:**tensor(float)**
*in* y_zero_point:**T3**
*out* y:**T3**|10+|**T1** = tensor(int8), tensor(uint8)
**T2** = tensor(int8), tensor(uint8)
**T3** = tensor(int8), tensor(uint8)| +|QuantizeLinear|*in* x:**T1**
*in* y_scale:**T1**
*in* y_zero_point:**T2**
*out* y:**T2**

or

*in* x:**T1**
*in* y_scale:**tensor(float)**
*in* y_zero_point:**T2**
*out* y:**T2**|21+|**T1** = tensor(float), tensor(float16)
**T2** = tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz), tensor(int16), tensor(int8), tensor(uint16), tensor(uint8)| +|||[19, 20]|**T1** = tensor(float), tensor(float16)
**T2** = tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz), tensor(int8), tensor(uint8)| |||[13, 18]|**T1** = tensor(float)
**T2** = tensor(int8), tensor(uint8)| |||[10, 12]|**T1** = tensor(float)
**T2** = tensor(int8), tensor(uint8)| |RNN|*in* X:**T**
*in* W:**T**
*in* R:**T**
*in* B:**T**
*in* sequence_lens:**T1**
*in* initial_h:**T**
*out* Y:**T**
*out* Y_h:**T**|14+|**T** = tensor(float)
**T1** = tensor(int32)|
@@ -278,7 +288,8 @@ Do not modify directly.*
|||[13, 17]|**T** = tensor(double), tensor(float), tensor(int32), tensor(int64)| |||[11, 12]|**T** = tensor(double), tensor(float), tensor(int32), tensor(int64)| |||[1, 10]|**T** = tensor(double), tensor(float), tensor(int32), tensor(int64)| -|ReduceMax|*in* data:**T**
*in* axes:**tensor(int64)**
*out* reduced:**T**

or

*in* data:**T**
*out* reduced:**T**|18+|**T** = tensor(double), tensor(float), tensor(int32), tensor(int64), tensor(int8), tensor(uint8)| +|ReduceMax|*in* data:**T**
*in* axes:**tensor(int64)**
*out* reduced:**T**

or

*in* data:**T**
*out* reduced:**T**|20+|**T** = tensor(bool), tensor(double), tensor(float), tensor(int32), tensor(int64), tensor(int8), tensor(uint8)| +|||[18, 19]|**T** = tensor(double), tensor(float), tensor(int32), tensor(int64), tensor(int8), tensor(uint8)| |||[13, 17]|**T** = tensor(double), tensor(float), tensor(int32), tensor(int64), tensor(int8), tensor(uint8)| |||12|**T** = tensor(double), tensor(float), tensor(int32), tensor(int64), tensor(int8), tensor(uint8)| |||11|**T** = tensor(double), tensor(float), tensor(int32), tensor(int64)|
@@ -287,7 +298,8 @@ Do not modify directly.*
|||[13, 17]|**T** = tensor(double), tensor(float), tensor(int32)| |||[11, 12]|**T** = tensor(double), tensor(float), tensor(int32)| |||[1, 10]|**T** = tensor(double), tensor(float), tensor(int32)| -|ReduceMin|*in* data:**T**
*in* axes:**tensor(int64)**
*out* reduced:**T**

or

*in* data:**T**
*out* reduced:**T**|18+|**T** = tensor(double), tensor(float), tensor(int32), tensor(int64), tensor(int8), tensor(uint8)| +|ReduceMin|*in* data:**T**
*in* axes:**tensor(int64)**
*out* reduced:**T**

or

*in* data:**T**
*out* reduced:**T**|20+|**T** = tensor(bool), tensor(double), tensor(float), tensor(int32), tensor(int64), tensor(int8), tensor(uint8)| +|||[18, 19]|**T** = tensor(double), tensor(float), tensor(int32), tensor(int64), tensor(int8), tensor(uint8)| |||[13, 17]|**T** = tensor(double), tensor(float), tensor(int32), tensor(int64), tensor(int8), tensor(uint8)| |||12|**T** = tensor(double), tensor(float), tensor(int32), tensor(int64), tensor(int8), tensor(uint8)| |||11|**T** = tensor(double), tensor(float), tensor(int32), tensor(int64)|
@@ -303,10 +315,12 @@ Do not modify directly.*
|||[13, 17]|**T** = tensor(double), tensor(float), tensor(int32), tensor(int64)| |||[11, 12]|**T** = tensor(double), tensor(float), tensor(int32), tensor(int64)| |||[1, 10]|**T** = tensor(double), tensor(float), tensor(int32), tensor(int64)| +|RegexFullMatch|*in* X:**T1**
*out* Y:**T2**|20+|**T1** = tensor(string)
**T2** = tensor(bool)| |Relu|*in* X:**T**
*out* Y:**T**|14+|**T** = tensor(double), tensor(float), tensor(int32), tensor(int8)| |||13|**T** = tensor(double), tensor(float)| |||[6, 12]|**T** = tensor(double), tensor(float)| -|Reshape|*in* data:**T**
*in* shape:**tensor(int64)**
*out* reshaped:**T**

or

*in* data:**T**
*out* reshaped:**T**|19+|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**shape** = tensor(int64)| +|Reshape|*in* data:**T**
*in* shape:**tensor(int64)**
*out* reshaped:**T**

or

*in* data:**T**
*out* reshaped:**T**|21+|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**shape** = tensor(int64)| +|||[19, 20]|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**shape** = tensor(int64)| |||[14, 18]|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**shape** = tensor(int64)| |||13|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**shape** = tensor(int64)| |||[5, 12]|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**shape** = tensor(int64)|
@@ -323,7 +337,8 @@ Do not modify directly.*
|STFT|*in* signal:**T1**
*in* frame_step:**T2**
*in* window:**T1**
*in* frame_length:**T2**
*out* output:**T1**|17+|**T1** = tensor(double), tensor(float)
**T2** = tensor(int32), tensor(int64)| |Scale|*in* input:**T**
*out* output:**T**|1+|**T** = tensor(float)| |ScaledTanh|*in* input:**T**
*out* output:**T**|1+|**T** = tensor(float)| -|Scan|*in* initial_state_and_scan_inputs:**V**
*out* final_state_and_scan_outputs:**V**

or

*in* sequence_lens:**I**
*in* initial_state_and_scan_inputs:**V**
*out* final_state_and_scan_outputs:**V**|19+|**V** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| +|Scan|*in* initial_state_and_scan_inputs:**V**
*out* final_state_and_scan_outputs:**V**

or

*in* sequence_lens:**I**
*in* initial_state_and_scan_inputs:**V**
*out* final_state_and_scan_outputs:**V**|21+|**V** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| +|||[19, 20]|**V** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| |||[16, 18]|**V** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| |||[11, 15]|**V** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| |||[9, 10]|**V** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
@@ -344,7 +359,8 @@ Do not modify directly.*
|SequenceErase|*in* input_sequence:**S**
*in* position:**I**
*out* output_sequence:**S**|11+|**I** = tensor(int32), tensor(int64)
**S** = seq(tensor(bfloat16)), seq(tensor(bool)), seq(tensor(double)), seq(tensor(float)), seq(tensor(float16)), seq(tensor(int16)), seq(tensor(int32)), seq(tensor(int64)), seq(tensor(int8)), seq(tensor(string)), seq(tensor(uint16)), seq(tensor(uint32)), seq(tensor(uint64)), seq(tensor(uint8))| |SequenceInsert|*in* input_sequence:**S**
*in* tensor:**T**
*in* position:**I**
*out* output_sequence:**S**|11+|**I** = tensor(int32), tensor(int64)
**S** = seq(tensor(bfloat16)), seq(tensor(bool)), seq(tensor(double)), seq(tensor(float)), seq(tensor(float16)), seq(tensor(int16)), seq(tensor(int32)), seq(tensor(int64)), seq(tensor(int8)), seq(tensor(string)), seq(tensor(uint16)), seq(tensor(uint32)), seq(tensor(uint64)), seq(tensor(uint8))| |SequenceLength|*in* input_sequence:**S**
*out* length:**I**|11+|**I** = tensor(int64)
**S** = seq(tensor(bfloat16)), seq(tensor(bool)), seq(tensor(double)), seq(tensor(float)), seq(tensor(float16)), seq(tensor(int16)), seq(tensor(int32)), seq(tensor(int64)), seq(tensor(int8)), seq(tensor(string)), seq(tensor(uint16)), seq(tensor(uint32)), seq(tensor(uint64)), seq(tensor(uint8))| -|Shape|*in* data:**T**
*out* shape:**T1**|19+|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**T1** = tensor(int64)| +|Shape|*in* data:**T**
*out* shape:**T1**|21+|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**T1** = tensor(int64)| +|||[19, 20]|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**T1** = tensor(int64)| |||[15, 18]|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**T1** = tensor(int64)| |||[13, 14]|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**T1** = tensor(int64)| |||[1, 12]|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**T1** = tensor(int64)|
@@ -356,7 +372,8 @@ Do not modify directly.*
|SimplifiedLayerNormalization|*in* X:**T**
*in* scale:**V**
*out* Y:**V**
*out* inv_std_var:**U**|1+|**T** = tensor(double), tensor(float)
**U** = tensor(double), tensor(float)
**V** = tensor(double), tensor(float)| |Sin|*in* input:**T**
*out* output:**T**|7+|**T** = tensor(double), tensor(float)| |Sinh|*in* input:**T**
*out* output:**T**|9+|**T** = tensor(float)| -|Size|*in* data:**T**
*out* size:**T1**|19+|**T** = tensor(bool), tensor(double), tensor(float), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**T1** = tensor(int64)| +|Size|*in* data:**T**
*out* size:**T1**|21+|**T** = tensor(bool), tensor(double), tensor(float), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**T1** = tensor(int64)| +|||[19, 20]|**T** = tensor(bool), tensor(double), tensor(float), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**T1** = tensor(int64)| |||[13, 18]|**T** = tensor(bool), tensor(double), tensor(float), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**T1** = tensor(int64)| |||[1, 12]|**T** = tensor(bool), tensor(double), tensor(float), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**T1** = tensor(int64)| |Slice|*in* data:**T**
*in* starts:**Tind**
*in* ends:**Tind**
*in* axes:**Tind**
*in* steps:**Tind**
*out* output:**T**

or

*in* data:**T**
*out* output:**T**|13+|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**Tind** = tensor(int32), tensor(int64)|
@@ -377,10 +394,13 @@ Do not modify directly.*
|SplitToSequence|*in* input:**T**
*in* split:**I**
*out* output_sequence:**S**|11+|**I** = tensor(int32), tensor(int64)
**S** = seq(tensor(bfloat16)), seq(tensor(bool)), seq(tensor(double)), seq(tensor(float)), seq(tensor(float16)), seq(tensor(int16)), seq(tensor(int32)), seq(tensor(int64)), seq(tensor(int8)), seq(tensor(string)), seq(tensor(uint16)), seq(tensor(uint32)), seq(tensor(uint64)), seq(tensor(uint8))
**T** = tensor(double), tensor(float), tensor(float16), tensor(int32), tensor(int64), tensor(string)| |Sqrt|*in* X:**T**
*out* Y:**T**|13+|**T** = tensor(double), tensor(float)| |||[6, 12]|**T** = tensor(double), tensor(float)| -|Squeeze|*in* data:**T**
*in* axes:**tensor(int64)**
*out* squeezed:**T**

or

*in* data:**T**
*out* squeezed:**T**|13+|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| +|Squeeze|*in* data:**T**
*in* axes:**tensor(int64)**
*out* squeezed:**T**

or

*in* data:**T**
*out* squeezed:**T**|21+|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| +|||[13, 20]|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| |||[11, 12]|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| |||[1, 10]|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| +|StringConcat|*in* X:**T**
*in* Y:**T**
*out* Z:**T**|20+|**T** = tensor(string)| |StringNormalizer|*in* X:**tensor(string)**
*out* Y:**tensor(string)**|10+|**X** = tensor(string)| +|StringSplit|*in* X:**T1**
*out* Y:**T2**
*out* Z:**T3**|20+|**T1** = tensor(string)
**T2** = tensor(string)
**T3** = tensor(int64)| |Sub|*in* A:**T**
*in* B:**T**
*out* C:**T**|14+|**T** = tensor(double), tensor(float), tensor(int32), tensor(int64)| |||13|**T** = tensor(double), tensor(float), tensor(int32), tensor(int64)| |||[7, 12]|**T** = tensor(double), tensor(float), tensor(int32), tensor(int64)|
@@ -398,11 +418,13 @@ Do not modify directly.*
|TopK|*in* X:**T**
*in* K:**tensor(int64)**
*out* Values:**T**
*out* Indices:**I**

or

*in* X:**T**
*out* Values:**T**
*out* Indices:**I**|11+|**I** = tensor(int64)
**T** = tensor(double), tensor(float), tensor(int32), tensor(int64)| |||10|**I** = tensor(int64)
**T** = tensor(double), tensor(float)| |||[1, 9]|**I** = tensor(int64)
**T** = tensor(double), tensor(float)| -|Transpose|*in* data:**T**
*out* transposed:**T**|13+|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| +|Transpose|*in* data:**T**
*out* transposed:**T**|21+|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| +|||[13, 20]|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| |||[1, 12]|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| |Trilu|*in* input:**T**
*in* k:**tensor(int64)**
*out* output:**T**|14+|**T** = tensor(double), tensor(float), tensor(int64)| |Unique|*in* X:**T**
*out* Y:**T**
*out* indices:**tensor(int64)**
*out* inverse_indices:**tensor(int64)**
*out* counts:**tensor(int64)**|11+|**T** = tensor(double), tensor(float), tensor(int64), tensor(int8), tensor(string)| -|Unsqueeze|*in* data:**T**
*in* axes:**tensor(int64)**
*out* expanded:**T**

or

*in* data:**T**
*out* expanded:**T**|13+|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| +|Unsqueeze|*in* data:**T**
*in* axes:**tensor(int64)**
*out* expanded:**T**

or

*in* data:**T**
*out* expanded:**T**|21+|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| +|||[13, 20]|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| |||[11, 12]|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| |||[1, 10]|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| |Upsample|*in* X:**T**
*in* scales:**tensor(float)**
*out* Y:**T**

or

*in* X:**T**
*out* Y:**T**|9|**T** = tensor(float), tensor(int32), tensor(int8), tensor(uint8)|
@@ -420,7 +442,8 @@ Do not modify directly.*
|DictVectorizer|*in* X:**T1**
*out* Y:**T2**|1+|**T1** = map(int64,tensor(double)), map(int64,tensor(float)), map(int64,tensor(string)), map(string,tensor(double)), map(string,tensor(float)), map(string,tensor(int64))
**T2** = tensor(double), tensor(float), tensor(int64), tensor(string)| |FeatureVectorizer|*in* X:**T1**
*out* Y:**tensor(float)**|1+|**T1** = tensor(double), tensor(float), tensor(int32), tensor(int64)| |Imputer|*in* X:**T**
*out* Y:**T**|1+|**T** = tensor(float), tensor(int64)| -|LabelEncoder|*in* X:**T1**
*out* Y:**T2**|2+|**T1** = tensor(float), tensor(int64), tensor(string)
**T2** = tensor(float), tensor(int64), tensor(string)| +|LabelEncoder|*in* X:**T1**
*out* Y:**T2**|4+|**T1** = tensor(double), tensor(float), tensor(int64), tensor(string)
**T2** = tensor(double), tensor(float), tensor(int16), tensor(int64), tensor(string)| +|||[2, 3]|**T1** = tensor(float), tensor(int64), tensor(string)
**T2** = tensor(float), tensor(int64), tensor(string)| |||1|**T1** = tensor(int64), tensor(string)
**T2** = tensor(int64), tensor(string)| |LinearClassifier|*in* X:**T1**
*out* Y:**T2**
*out* Z:**tensor(float)**|1+|**T1** = tensor(double), tensor(float), tensor(int32), tensor(int64)
**T2** = tensor(int64), tensor(string)| |LinearRegressor|*in* X:**T**
*out* Y:**tensor(float)**|1+|**T** = tensor(float)|
@@ -463,7 +486,7 @@ Do not modify directly.*
|MatMulFpQ4|*in* A:**T1**
*in* B:**T2**
*in* B_shape:**T3**
*out* Y:**T1**|1+|**T1** = tensor(float)
**T2** = tensor(uint8)
**T3** = tensor(int64)| |MatMulInteger16|*in* A:**T1**
*in* B:**T2**
*out* Y:**T3**|1+|**T1** = tensor(int16)
**T2** = tensor(int16)
**T3** = tensor(int32)| |MatMulIntegerToFloat|*in* A:**T1**
*in* B:**T2**
*in* a_scale:**T3**
*in* b_scale:**T3**
*in* a_zero_point:**T1**
*in* b_zero_point:**T2**
*in* bias:**T3**
*out* Y:**T3**|1+|**T1** = tensor(int8), tensor(uint8)
**T2** = tensor(int8), tensor(uint8)
**T3** = tensor(float)| -|MatMulNBits|*in* A:**T1**
*in* B:**T2**
*in* scales:**T1**
*in* zero_points:**T2**
*out* Y:**T1**|1+|**T1** = tensor(float)
**T2** = tensor(uint8)| +|MatMulNBits|*in* A:**T1**
*in* B:**T2**
*in* scales:**T1**
*in* zero_points:**T3**
*in* g_idx:**T4**
*out* Y:**T1**|1+|**T1** = tensor(float)
**T2** = tensor(uint8)
**T3** = tensor(float), tensor(uint8)
**T4** = tensor(int32)| |MaxpoolWithMask|*in* X:**T**
*in* M:**tensor(int32)**
*out* Y:**T**|1+|**T** = tensor(float)| |MultiHeadAttention|*in* query:**T**
*in* key:**T**
*in* value:**T**
*in* bias:**T**
*in* key_padding_mask:**M**
*in* relative_position_bias:**T**
*in* past_key:**T**
*in* past_value:**T**
*out* output:**T**
*out* present_key:**T**
*out* present_value:**T**|1+|**T** = tensor(float)| |MurmurHash3|*in* X:**T1**
*out* Y:**T2**|1+|**T1** = tensor(double), tensor(float), tensor(int32), tensor(int64), tensor(string), tensor(uint32), tensor(uint64)
**T2** = tensor(int32), tensor(uint32)|
@@ -493,7 +516,7 @@ Do not modify directly.*
|TransposeMatMul|*in* A:**T**
*in* B:**T**
*out* Y:**T**|1+|**T** = tensor(float)| |Trilu|*in* X:**T**
*in* k:**tensor(int64)**
*out* Y:**T**|1+|**T** = tensor(double), tensor(float), tensor(int64)| |Unique|*in* x:**T**
*out* y:**T**
*out* idx:**tensor(int64)**
*out* counts:**tensor(int64)**|1+|**T** = tensor(float)| -|WhisperBeamSearch|*in* input_ids:**F**
*in* max_length:**I**
*in* min_length:**I**
*in* num_beams:**I**
*in* num_return_sequences:**I**
*in* length_penalty:**T**
*in* repetition_penalty:**T**
*in* vocab_mask:**M**
*in* prefix_vocab_mask:**M**
*in* attention_mask:**I**
*in* decoder_input_ids:**I**
*in* logits_processor:**I**
*in* cross_qk_layer_head:**I**
*in* extra_decoding_ids:**I**
*out* sequences:**I**
*out* sequences_scores:**T**
*out* scores:**T**
*out* cross_qk:**V**
*out* non_speech_probs:**T**|1+|**T** = tensor(float)| +|WhisperBeamSearch|*in* input_ids:**F**
*in* max_length:**I**
*in* min_length:**I**
*in* num_beams:**I**
*in* num_return_sequences:**I**
*in* length_penalty:**T**
*in* repetition_penalty:**T**
*in* vocab_mask:**M**
*in* prefix_vocab_mask:**M**
*in* attention_mask:**I**
*in* decoder_input_ids:**I**
*in* logits_processor:**I**
*in* cross_qk_layer_head:**I**
*in* extra_decoding_ids:**I**
*in* temperature:**T**
*out* sequences:**I**
*out* sequences_scores:**T**
*out* scores:**T**
*out* cross_qk:**V**
*out* non_speech_probs:**T**|1+|**T** = tensor(float)| |WordConvEmbedding|*in* Sequence:**T**
*in* W:**T1**
*in* B:**T1**
*in* C:**T1**
*out* Y:**T1**|1+|**T** = tensor(int32)
**T1** = tensor(float)|
| | |
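The CPU contrib-op hunks above mostly add trailing optional inputs (for example `temperature` on WhisperBeamSearch and `g_idx` on MatMulNBits). ONNX Runtime binds inputs by name, so supplying such a new optional input is just one more named tensor in the `Run` call. A minimal C++ sketch; the model path, the single bound input, and the output name are illustrative assumptions (a real beam-search model also needs its required inputs such as `input_ids` and `max_length`):

```cpp
#include <onnxruntime_cxx_api.h>
#include <array>

int main() {
  Ort::Env env(ORT_LOGGING_LEVEL_WARNING, "contrib-demo");
  // Hypothetical model exporting the WhisperBeamSearch contrib op.
  Ort::Session session(env, ORT_TSTR("whisper_beamsearch.onnx"), Ort::SessionOptions{});

  // The newly added optional input is bound by name like any other input.
  Ort::MemoryInfo mem = Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeDefault);
  float temperature = 1.0f;
  std::array<int64_t, 1> shape{1};
  Ort::Value temp = Ort::Value::CreateTensor<float>(mem, &temperature, 1, shape.data(), shape.size());

  const char* input_names[] = {"temperature"};
  const char* output_names[] = {"sequences"};
  auto outputs = session.Run(Ort::RunOptions{nullptr}, input_names, &temp, 1, output_names, 1);
  return 0;
}
```

Omitting an optional input entirely keeps the operator's default behavior, which is why rows that merely append inputs remain backward compatible.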
@@ -600,6 +623,7 @@ Do not modify directly.*
|GatherND|*in* data:**T**
*in* indices:**tensor(int64)**
*out* output:**T**|13+|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int64)
**indices** = tensor(int64)| |||12|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int64)
**indices** = tensor(int64)| |||11|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int64)
**indices** = tensor(int64)| +|Gelu|*in* X:**T**
*out* Y:**T**|20+|**T** = tensor(double), tensor(float), tensor(float16)| |Gemm|*in* A:**T**
*in* B:**T**
*in* C:**T**
*out* Y:**T**|13+|**T** = tensor(bfloat16), tensor(double), tensor(float), tensor(float16)|
|||[11, 12]|**T** = tensor(double), tensor(float), tensor(float16)|
|||[9, 10]|**T** = tensor(double), tensor(float), tensor(float16)|
@@ -611,6 +635,7 @@ Do not modify directly.*
|||[7, 8]|**T** = tensor(double), tensor(float), tensor(float16)|
|GreaterOrEqual|*in* A:**T**
*in* B:**T**
*out* C:**T1**|16+|**T** = tensor(double), tensor(float), tensor(float16), tensor(int32), tensor(int64), tensor(uint32), tensor(uint64)
**T1** = tensor(bool)| |||[12, 15]|**T** = tensor(double), tensor(float), tensor(float16), tensor(int32), tensor(int64), tensor(uint32), tensor(uint64)
**T1** = tensor(bool)| +|GridSample|*in* X:**T1**
*in* grid:**T2**
*out* Y:**T1**|16+|**T1** = tensor(float)
**T2** = tensor(float)| |HardSigmoid|*in* X:**T**
*out* Y:**T**|6+|**T** = tensor(double), tensor(float), tensor(float16)| |Identity|*in* input:**T**
*out* output:**T**

or

*in* input:**V**
*out* output:**V**|19+|**V** = seq(tensor(bfloat16)), seq(tensor(bool)), seq(tensor(double)), seq(tensor(float)), seq(tensor(float16)), seq(tensor(float8e4m3fn)), seq(tensor(float8e4m3fnuz)), seq(tensor(float8e5m2)), seq(tensor(float8e5m2fnuz)), seq(tensor(int16)), seq(tensor(int32)), seq(tensor(int64)), seq(tensor(int8)), seq(tensor(uint16)), seq(tensor(uint32)), seq(tensor(uint64)), seq(tensor(uint8)), tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
|||[14, 18]|**V** = seq(tensor(bfloat16)), seq(tensor(bool)), seq(tensor(double)), seq(tensor(float)), seq(tensor(float16)), seq(tensor(int16)), seq(tensor(int32)), seq(tensor(int64)), seq(tensor(int8)), seq(tensor(uint16)), seq(tensor(uint32)), seq(tensor(uint64)), seq(tensor(uint8)), tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
@@ -622,6 +647,11 @@ Do not modify directly.*
|||[1, 10]|**B** = tensor(bool)
**V** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| |ImageScaler|*in* input:**T**
*out* output:**T**|1+|**T** = tensor(double), tensor(float), tensor(float16)| |InstanceNormalization|*in* input:**T**
*in* scale:**T**
*in* B:**T**
*out* output:**T**|6+|**T** = tensor(double), tensor(float), tensor(float16)| +|IsInf|*in* X:**T1**
*out* Y:**T2**|20+|**T1** = tensor(bfloat16), tensor(double), tensor(float), tensor(float16), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz)
**T2** = tensor(bool)| +|||[10, 19]|**T1** = tensor(double), tensor(float)
**T2** = tensor(bool)| +|IsNaN|*in* X:**T1**
*out* Y:**T2**|20+|**T1** = tensor(bfloat16), tensor(double), tensor(float), tensor(float16), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz)
**T2** = tensor(bool)| +|||[13, 19]|**T1** = tensor(bfloat16), tensor(double), tensor(float), tensor(float16)
**T2** = tensor(bool)| +|||[9, 12]|**T1** = tensor(double), tensor(float), tensor(float16)
**T2** = tensor(bool)| |LRN|*in* X:**T**
*out* Y:**T**|13+|**T** = tensor(double), tensor(float), tensor(float16)| |||[1, 12]|**T** = tensor(double), tensor(float), tensor(float16)| |LSTM|*in* X:**T**
*in* W:**T**
*in* R:**T**
*in* B:**T**
*in* sequence_lens:**T1**
*in* initial_h:**T**
*in* initial_c:**T**
*in* P:**T**
*out* Y:**T**
*out* Y_h:**T**
*out* Y_c:**T**|14+|**T** = tensor(double), tensor(float), tensor(float16)
**T1** = tensor(int32)|
@@ -676,7 +706,8 @@ Do not modify directly.*
|PRelu|*in* X:**T**
*in* slope:**T**
*out* Y:**T**|16+|**T** = tensor(double), tensor(float), tensor(float16)| |||[9, 15]|**T** = tensor(double), tensor(float), tensor(float16)| |||[7, 8]|**T** = tensor(double), tensor(float), tensor(float16)| -|Pad|*in* data:**T**
*in* pads:**tensor(int64)**
*in* constant_value:**T**
*in* axes:**Tind**
*out* output:**T**

or

*in* data:**T**
*in* pads:**tensor(int64)**
*in* constant_value:**T**
*out* output:**T**

or

*in* data:**T**
*out* output:**T**|13+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16)| +|Pad|*in* data:**T**
*in* pads:**tensor(int64)**
*in* constant_value:**T**
*in* axes:**Tind**
*out* output:**T**

or

*in* data:**T**
*in* pads:**tensor(int64)**
*in* constant_value:**T**
*out* output:**T**

or

*in* data:**T**
*out* output:**T**|18+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16)| +|||[13, 17]|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16)| |||[11, 12]|**T** = tensor(double), tensor(float), tensor(float16)| |||[2, 10]|**T** = tensor(double), tensor(float), tensor(float16)| |ParametricSoftplus|*in* X:**T**
*out* Y:**T**|1+|**T** = tensor(double), tensor(float), tensor(float16)|
@@ -724,7 +755,8 @@ Do not modify directly.*
|||13|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**shape** = tensor(int64)| |||[5, 12]|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**shape** = tensor(int64)| |||[1, 4]|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| -|Resize|*in* X:**T**
*in* scales:**tensor(float)**
*out* Y:**T**

or

*in* X:**T1**
*in* roi:**T2**
*in* scales:**tensor(float)**
*in* sizes:**tensor(int64)**
*out* Y:**T1**|13+|**T1** = tensor(double), tensor(float), tensor(float16), tensor(int32), tensor(uint8)| +|Resize|*in* X:**T**
*in* scales:**tensor(float)**
*out* Y:**T**

or

*in* X:**T1**
*in* roi:**T2**
*in* scales:**tensor(float)**
*in* sizes:**tensor(int64)**
*out* Y:**T1**|18+|**T1** = tensor(double), tensor(float), tensor(float16), tensor(int32), tensor(uint8)| +|||[13, 17]|**T1** = tensor(double), tensor(float), tensor(float16), tensor(int32), tensor(uint8)| |||[11, 12]|**T1** = tensor(double), tensor(float), tensor(float16), tensor(int32), tensor(uint8)| |||10|**T** = tensor(double), tensor(float), tensor(float16), tensor(int32), tensor(uint8)| |ReverseSequence|*in* input:**T**
*in* sequence_lens:**tensor(int64)**
*out* Y:**T**|10+|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
@@ -737,7 +769,9 @@ Do not modify directly.*
|||[9, 10]|**V** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
|||8|**I** = tensor(int64)
**V** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| |Scatter|*in* data:**T**
*in* indices:**Tind**
*in* updates:**T**
*out* output:**T**|[9, 10]|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**Tind** = tensor(int32), tensor(int64)| -|ScatterElements|*in* data:**T**
*in* indices:**Tind**
*in* updates:**T**
*out* output:**T**|13+|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**Tind** = tensor(int32), tensor(int64)| +|ScatterElements|*in* data:**T**
*in* indices:**Tind**
*in* updates:**T**
*out* output:**T**|18+|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**Tind** = tensor(int32), tensor(int64)| +|||[16, 17]|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**Tind** = tensor(int32), tensor(int64)| +|||[13, 15]|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**Tind** = tensor(int32), tensor(int64)| |||[11, 12]|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**Tind** = tensor(int32), tensor(int64)| |ScatterND|*in* data:**T**
*in* indices:**tensor(int64)**
*in* updates:**T**
*out* output:**T**|13+|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
|||[11, 12]|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
@@ -756,7 +790,7 @@ Do not modify directly.*
|Sigmoid|*in* X:**T**
*out* Y:**T**|13+|**T** = tensor(bfloat16), tensor(double), tensor(float), tensor(float16)| |||[6, 12]|**T** = tensor(double), tensor(float), tensor(float16)| |Sign|*in* input:**T**
*out* output:**T**|13+|**T** = tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| -|SimplifiedLayerNormalization|*in* X:**T**
*in* scale:**V**
*out* Y:**V**
*out* inv_std_var:**U**|1+|**T** = tensor(double), tensor(float), tensor(float16)
**U** = tensor(double), tensor(float)
**V** = tensor(double), tensor(float), tensor(float16)| +|SimplifiedLayerNormalization|*in* X:**T**
*in* scale:**V**
*out* Y:**V**
*out* inv_std_var:**U**|1+|**T** = tensor(bfloat16), tensor(double), tensor(float), tensor(float16)
**U** = tensor(double), tensor(float)
**V** = tensor(bfloat16), tensor(double), tensor(float), tensor(float16)| |Sin|*in* input:**T**
*out* output:**T**|7+|**T** = tensor(double), tensor(float), tensor(float16)| |Size|*in* data:**T**
*out* size:**T1**|13+|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**T1** = tensor(int64)| |||[1, 12]|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**T1** = tensor(int64)|
@@ -775,7 +809,7 @@ Do not modify directly.*
|||[13, 17]|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
|||[11, 12]|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
|||[2, 10]|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
-|Sqrt|*in* X:**T**
*out* Y:**T**|13+|**T** = tensor(double), tensor(float), tensor(float16)| +|Sqrt|*in* X:**T**
*out* Y:**T**|13+|**T** = tensor(bfloat16), tensor(double), tensor(float), tensor(float16)| |||[6, 12]|**T** = tensor(double), tensor(float), tensor(float16)| |Squeeze|*in* data:**T**
*in* axes:**tensor(int64)**
*out* squeezed:**T**

or

*in* data:**T**
*out* squeezed:**T**|13+|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
|||[11, 12]|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
@@ -834,22 +868,24 @@ Do not modify directly.*
|GatedRelativePositionBias|*in* query_layer:**T**
*in* query_bias:**T**
*in* rel_pos:**T**
*in* weight:**T**
*in* bias:**T**
*in* eco_a:**T**
*in* token_offset:**M**
*out* output:**T**|1+|**T** = tensor(float), tensor(float16)| |Gelu|*in* X:**T**
*out* Y:**T**|1+|**T** = tensor(double), tensor(float), tensor(float16)| |GemmFloat8|*in* A:**TA**
*in* B:**TB**
*in* C:**TC**
*in* scaleA:**TS**
*in* scaleB:**TS**
*in* scaleY:**TS**
*out* Y:**TR**|1+|**TA** = tensor(bfloat16), tensor(float), tensor(float16), tensor(float8e4m3fn), tensor(float8e5m2)
**TB** = tensor(bfloat16), tensor(float), tensor(float16), tensor(float8e4m3fn), tensor(float8e5m2)
**TR** = tensor(bfloat16), tensor(float), tensor(float16), tensor(float8e4m3fn), tensor(float8e5m2)
**TS** = tensor(float)| +|GemmaRotaryEmbedding|*in* emb:**U**
*in* q:**T**
*in* q_rot:**T**
*in* k:**T**
*in* k_rot:**T**
*out* output1:**T**
*out* output2:**T**|1+|**T** = tensor(float16)
**U** = tensor(float)| |GreedySearch|*in* input_ids:**I**
*in* max_length:**I**
*in* min_length:**I**
*in* repetition_penalty:**T**
*in* vocab_mask:**I**
*in* prefix_vocab_mask:**I**
*in* attention_mask:**I**
*out* sequences:**I**|1+|**T** = tensor(float), tensor(float16)| |GridSample|*in* X:**T1**
*in* Grid:**T1**
*out* Y:**T2**|1+|**T1** = tensor(float)
**T2** = tensor(float)| |GroupNorm|*in* X:**T**
*in* gamma:**M**
*in* beta:**M**
*out* Y:**T**|1+|**T** = tensor(float), tensor(float16)| -|GroupQueryAttention|*in* query:**T**
*in* key:**T**
*in* value:**T**
*in* past_key:**T**
*in* past_value:**T**
*in* seqlens_k:**M**
*in* total_sequence_length:**M**
*out* output:**T**
*out* present_key:**T**
*out* present_value:**T**|1+|**M** = tensor(int32)
**T** = tensor(float16)| +|GroupQueryAttention|*in* query:**T**
*in* key:**T**
*in* value:**T**
*in* past_key:**T**
*in* past_value:**T**
*in* seqlens_k:**M**
*in* total_sequence_length:**M**
*in* cos_cache:**T**
*in* sin_cache:**T**
*out* output:**T**
*out* present_key:**T**
*out* present_value:**T**|1+|**M** = tensor(int32)
**T** = tensor(bfloat16), tensor(float16)| |Inverse|*in* X:**T**
*out* Y:**T**|1+|**T** = tensor(double), tensor(float), tensor(float16)| |Irfft|*in* X:**T**
*out* Y:**T**|1+|**T** = tensor(double), tensor(float), tensor(float16)| |LongformerAttention|*in* input:**T**
*in* weight:**T**
*in* bias:**T**
*in* mask:**T**
*in* global_weight:**T**
*in* global_bias:**T**
*in* global:**G**
*out* output:**T**|1+|**T** = tensor(float), tensor(float16)| |MatMulBnb4|*in* A:**T1**
*in* B:**T2**
*in* absmax:**T1**
*out* Y:**T1**|1+|**T1** = tensor(bfloat16), tensor(float), tensor(float16)
**T2** = tensor(uint8)| -|MatMulNBits|*in* A:**T1**
*in* B:**T2**
*in* scales:**T1**
*in* zero_points:**T2**
*out* Y:**T1**|1+|**T1** = tensor(float), tensor(float16)
**T2** = tensor(uint8)| -|MoE|*in* input:**T**
*in* router_probs:**T**
*in* fc1_experts_weights:**T**
*in* fc2_experts_weights:**T**
*in* fc1_experts_bias:**T**
*in* fc2_experts_bias:**T**
*out* output:**T**|1+|**T** = tensor(float), tensor(float16)| +|MatMulNBits|*in* A:**T1**
*in* B:**T2**
*in* scales:**T1**
*in* zero_points:**T3**
*in* g_idx:**T4**
*out* Y:**T1**|1+|**T1** = tensor(float), tensor(float16)
**T2** = tensor(uint8)| +|MoE|*in* input:**T**
*in* router_probs:**T**
*in* fc1_experts_weights:**T**
*in* fc1_experts_bias:**T**
*in* fc2_experts_weights:**T**
*in* fc2_experts_bias:**T**
*in* fc3_experts_weights:**T**
*in* fc3_experts_bias:**T**
*out* output:**T**|1+|**T** = tensor(float), tensor(float16)| |MultiHeadAttention|*in* query:**T**
*in* key:**T**
*in* value:**T**
*in* bias:**T**
*in* key_padding_mask:**M**
*in* relative_position_bias:**T**
*in* past_key:**T**
*in* past_value:**T**
*out* output:**T**
*out* present_key:**T**
*out* present_value:**T**|1+|**T** = tensor(float), tensor(float16)| |NGramRepeatBlock|*in* input_ids:**Tid**
*in* scores:**T**
*out* scores_out:**T**|1+|**T** = tensor(float)
**Tid** = tensor(int64)| |NhwcConv|*in* X:**T**
*in* W:**T**
*in* B:**T**
*out* Y:**T**|1+|**T** = tensor(float), tensor(float16)| |PackedAttention|*in* input:**T**
*in* weights:**T**
*in* bias:**T**
*in* token_offset:**M**
*in* cumulative_sequence_length:**M**
*in* relative_position_bias:**T**
*out* output:**T**|1+|**T** = tensor(float), tensor(float16)| |PackedMultiHeadAttention|*in* query:**T**
*in* key:**T**
*in* value:**T**
*in* bias:**T**
*in* token_offset:**M**
*in* cumulative_sequence_length:**M**
*in* relative_position_bias:**T**
*out* output:**T**|1+|**T** = tensor(float), tensor(float16)| |QAttention|*in* input:**T1**
*in* weight:**T2**
*in* bias:**T3**
*in* input_scale:**T3**
*in* weight_scale:**T3**
*in* mask_index:**T4**
*in* input_zero_point:**T1**
*in* weight_zero_point:**T2**
*in* past:**T3**
*out* output:**T3**
*out* present:**T3**|1+|**T1** = tensor(int8)
**T2** = tensor(int8)
**T3** = tensor(float), tensor(float16)
**T4** = tensor(int32)| +|QMoE|*in* input:**T**
*in* router_probs:**T**
*in* fc1_experts_weights:**T1**
*in* fc1_scales:**T**
*in* fc1_experts_bias:**T**
*in* fc2_experts_weights:**T1**
*in* fc2_scales:**T**
*in* fc2_experts_bias:**T**
*in* fc3_experts_weights:**T1**
*in* fc3_scales:**T**
*in* fc3_experts_bias:**T**
*out* output:**T**|1+|**T** = tensor(float16)
**T1** = tensor(uint8)| |QOrderedAttention|*in* input:**Q**
*in* scale_input:**S**
*in* scale_Q_gemm:**S**
*in* scale_K_gemm:**S**
*in* scale_V_gemm:**S**
*in* Q_weight:**Q**
*in* K_weight:**Q**
*in* V_weight:**Q**
*in* scale_Q_weight:**S**
*in* scale_K_weight:**S**
*in* scale_V_weight:**S**
*in* Q_bias:**S**
*in* K_bias:**S**
*in* V_bias:**S**
*in* scale_QKT_gemm:**S**
*in* scale_QKT_softmax:**S**
*in* scale_values_gemm:**S**
*in* mask_index:**G**
*in* past:**Q**
*in* relative_position_bias:**S**
*out* output:**Q**|1+|**G** = tensor(int32)
**Q** = tensor(int8)
**S** = tensor(float)| |QOrderedGelu|*in* X:**Q**
*in* scale_X:**S**
*in* scale_Y:**S**
*out* Y:**Q**|1+|**Q** = tensor(int8)
**S** = tensor(float)| |QOrderedLayerNormalization|*in* X:**Q**
*in* scale_X:**S**
*in* scale:**F**
*in* B:**F**
*in* scale_Y:**S**
*out* Y:**Q**|1+|**F** = tensor(float), tensor(float16)
**Q** = tensor(int8)
**S** = tensor(float)|
@@ -862,7 +898,7 @@ Do not modify directly.*
|RemovePadding|*in* input:**T**
*in* sequence_token_count:**M**
*out* output:**T**
*out* token_offset:**M**
*out* cumulated_seq_len:**M**
*out* max_seq_len:**M**|1+|**T** = tensor(float), tensor(float16)| |RestorePadding|*in* input:**T**
*in* token_offset:**M**
*out* output:**T**|1+|**T** = tensor(float), tensor(float16)| |Rfft|*in* X:**T**
*out* Y:**T**|1+|**T** = tensor(double), tensor(float), tensor(float16)| -|RotaryEmbedding|*in* input:**T**
*in* position_ids:**M**
*in* cos_cache:**T**
*in* sin_cache:**T**
*out* output:**T**|1+|**M** = tensor(int64)
**T** = tensor(float), tensor(float16)| +|RotaryEmbedding|*in* input:**T**
*in* position_ids:**M**
*in* cos_cache:**T**
*in* sin_cache:**T**
*out* output:**T**|1+|**M** = tensor(int64)
**T** = tensor(bfloat16), tensor(float), tensor(float16)| |Sampling|*in* input_ids:**I**
*in* max_length:**I**
*in* min_length:**I**
*in* repetition_penalty:**T**
*in* vocab_mask:**I**
*in* prefix_vocab_mask:**I**
*in* attention_mask:**I**
*in* presence_mask:**I**
*in* seed:**I**
*out* sequences:**I**
*out* filtered_logits:**T**|1+|**T** = tensor(float), tensor(float16)| |SkipGroupNorm|*in* X:**T**
*in* gamma:**M**
*in* beta:**M**
*in* skip:**T**
*in* bias:**T**
*out* Y:**T**
*out* S:**T**|1+|**T** = tensor(float), tensor(float16)| |SkipLayerNormalization|*in* input:**T**
*in* skip:**T**
*in* gamma:**T**
*in* beta:**T**
*in* bias:**T**
*out* output:**T**
*out* mean:**U**
*out* inv_std_var:**U**
*out* input_skip_bias_sum:**T**|1+|**T** = tensor(float), tensor(float16)|
@@ -870,7 +906,7 @@ Do not modify directly.*
|TransposeMatMul|*in* A:**T**
*in* B:**T**
*out* Y:**T**|1+|**T** = tensor(bfloat16), tensor(double), tensor(float), tensor(float16)| |Trilu|*in* X:**T**
*in* k:**tensor(int64)**
*out* Y:**T**|1+|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| |UnfoldTensor|*in* input:**T**
*out* output:**T**|1+|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| -|WhisperBeamSearch|*in* input_ids:**F**
*in* max_length:**I**
*in* min_length:**I**
*in* num_beams:**I**
*in* num_return_sequences:**I**
*in* length_penalty:**T**
*in* repetition_penalty:**T**
*in* vocab_mask:**M**
*in* prefix_vocab_mask:**M**
*in* attention_mask:**I**
*in* decoder_input_ids:**I**
*in* logits_processor:**I**
*in* cross_qk_layer_head:**I**
*in* extra_decoding_ids:**I**
*out* sequences:**I**
*out* sequences_scores:**T**
*out* scores:**T**
*out* cross_qk:**V**
*out* non_speech_probs:**T**|1+|**T** = tensor(float), tensor(float16)| +|WhisperBeamSearch|*in* input_ids:**F**
*in* max_length:**I**
*in* min_length:**I**
*in* num_beams:**I**
*in* num_return_sequences:**I**
*in* length_penalty:**T**
*in* repetition_penalty:**T**
*in* vocab_mask:**M**
*in* prefix_vocab_mask:**M**
*in* attention_mask:**I**
*in* decoder_input_ids:**I**
*in* logits_processor:**I**
*in* cross_qk_layer_head:**I**
*in* extra_decoding_ids:**I**
*in* temperature:**T**
*out* sequences:**I**
*out* sequences_scores:**T**
*out* scores:**T**
*out* cross_qk:**V**
*out* non_speech_probs:**T**|1+|**T** = tensor(float), tensor(float16)|
| | | |
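The CUDA rows above (for example GroupQueryAttention's new `cos_cache`/`sin_cache` inputs and WhisperBeamSearch's `temperature`) land alongside the per-run plumbing added to `IExecutionProvider` later in this diff, where graph capture appears to be keyed by an annotation id (see the `IsGraphCaptured`/`ReplayGraph` signatures below). From the application side, per-run settings travel as run-config entries on `RunOptions`. A hedged sketch; the config key `"gpu_graph_id"` is an assumption used for illustration, so check the run-options config header for the exact name:

```cpp
#include <onnxruntime_cxx_api.h>

// Run a prepared session with a per-run CUDA-graph annotation.
void run_with_annotation(Ort::Session& session,
                         const char* const* in_names, const Ort::Value* inputs, size_t n_in,
                         const char* const* out_names, size_t n_out) {
  Ort::RunOptions ro;
  // Per-run configuration entry; the key below is assumed for illustration.
  ro.AddConfigEntry("gpu_graph_id", "1");
  auto outs = session.Run(ro, in_names, inputs, n_in, out_names, n_out);
  (void)outs;
}
```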
@@ -903,7 +939,8 @@ Do not modify directly.*
|Asinh|*in* input:**T**
*out* output:**T**|9+|**T** = tensor(float), tensor(float16)|
|Atan|*in* input:**T**
*out* output:**T**|7+|**T** = tensor(float), tensor(float16)| |Atanh|*in* input:**T**
*out* output:**T**|9+|**T** = tensor(float), tensor(float16)| -|AveragePool|*in* X:**T**
*out* Y:**T**|11+|**T** = tensor(float), tensor(float16)| +|AveragePool|*in* X:**T**
*out* Y:**T**|19+|**T** = tensor(float), tensor(float16)| +|||11+|**T** = tensor(float), tensor(float16)| |||10+|**T** = tensor(float), tensor(float16)| |||7+|**T** = tensor(float), tensor(float16)| |BatchNormalization|*in* X:**T**
*in* scale:**T**
*in* B:**T**
*in* input_mean:**U**
*in* input_var:**U**
*out* Y:**T**
*out* running_mean:**U**
*out* running_var:**U**

or

*in* X:**T**
*in* scale:**T**
*in* B:**T**
*in* mean:**T**
*in* var:**T**
*out* Y:**T**
*out* mean:**T**
*out* var:**T**
*out* saved_mean:**T**
*out* saved_var:**T**

or

*in* X:**T**
*in* scale:**T1**
*in* B:**T1**
*in* input_mean:**T2**
*in* input_var:**T2**
*out* Y:**T**
*out* running_mean:**T2**
*out* running_var:**T2**|15+|**T** = tensor(float), tensor(float16)|
@@ -915,10 +952,12 @@ Do not modify directly.*
|BitwiseNot|*in* X:**T**
*out* Y:**T**|18+|**T** = tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| |BitwiseOr|*in* A:**T**
*in* B:**T**
*out* C:**T**|18+|**T** = tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| |BitwiseXor|*in* A:**T**
*in* B:**T**
*out* C:**T**|18+|**T** = tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| -|Cast|*in* input:**T1**
*out* output:**T2**|13+|**T1** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**T2** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| +|Cast|*in* input:**T1**
*out* output:**T2**|19+|**T1** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**T2** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| +|||13+|**T1** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**T2** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| |||9+|**T1** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**T2** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| |||6+|**T1** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**T2** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| -|CastLike|*in* input:**T1**
*in* target_type:**T2**
*out* output:**T2**|15+|**T1** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**T2** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| +|CastLike|*in* input:**T1**
*in* target_type:**T2**
*out* output:**T2**|19+|**T1** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**T2** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| +|||15+|**T1** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**T2** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| |Ceil|*in* X:**T**
*out* Y:**T**|13+|**T** = tensor(float), tensor(float16)| |||6+|**T** = tensor(float), tensor(float16)| |Celu|*in* X:**T**
*out* Y:**T**|12+|**T** = tensor(float), tensor(float16)|
@@ -945,16 +984,18 @@ Do not modify directly.*
|DepthToSpace|*in* input:**T**
*out* output:**T**|13+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| |||11+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| |||1+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| -|DequantizeLinear|*in* x:**T**
*in* x_scale:**tensor(float)**
*in* x_zero_point:**T**
*out* y:**tensor(float)**

or

*in* x:**T1**
*in* x_scale:**T2**
*in* x_zero_point:**T1**
*out* y:**T2**|13+|**T** = tensor(int32), tensor(int8), tensor(uint8)| +|DequantizeLinear|*in* x:**T**
*in* x_scale:**tensor(float)**
*in* x_zero_point:**T**
*out* y:**tensor(float)**

or

*in* x:**T1**
*in* x_scale:**T2**
*in* x_zero_point:**T1**
*out* y:**T2**|19+|**T1** = tensor(int32), tensor(int8), tensor(uint8)
**T2** = tensor(float), tensor(float16)| +|||13+|**T** = tensor(int32), tensor(int8), tensor(uint8)| |||10+|**T** = tensor(int32), tensor(int8), tensor(uint8)| |Div|*in* A:**T**
*in* B:**T**
*out* C:**T**|14+|**T** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| |||13+|**T** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| |||7+|**T** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| |Dropout|*in* data:**T**
*in* ratio:**T1**
*in* training_mode:**T2**
*out* output:**T**
*out* mask:**T2**

or

*in* data:**T**
*out* output:**T**
*out* mask:**T**

or

*in* data:**T**
*out* output:**T**
*out* mask:**T1**|7+|**T** = tensor(float), tensor(float16)| -|DynamicQuantizeLinear|*in* x:**T1**
*out* y:**T2**
*out* y_scale:**tensor(float)**
*out* y_zero_point:**T2**|11+|**T1** = tensor(float)
**T2** = tensor(uint8)| +|DynamicQuantizeLinear|*in* x:**T1**
*out* y:**T2**
*out* y_scale:**tensor(float)**
*out* y_zero_point:**T2**|11+|**T1** = tensor(float)
**T2** = tensor(int8), tensor(uint8)| |Einsum|*in* Inputs:**T**
*out* Output:**T**|12+|**T** = tensor(float), tensor(float16)| |Elu|*in* X:**T**
*out* Y:**T**|6+|**T** = tensor(float), tensor(float16)| -|Equal|*in* A:**T**
*in* B:**T**
*out* C:**T1**|13+|**T** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**T1** = tensor(bool)| +|Equal|*in* A:**T**
*in* B:**T**
*out* C:**T1**|19+|**T** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**T1** = tensor(bool)| +|||13+|**T** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**T1** = tensor(bool)| |||11+|**T** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**T1** = tensor(bool)| |||7+|**T** = tensor(float), tensor(float16)
**T1** = tensor(bool)| |Erf|*in* input:**T**
*out* output:**T**|13+|**T** = tensor(float), tensor(float16)|
@@ -997,7 +1038,8 @@ Do not modify directly.*
|Hardmax|*in* input:**T**
*out* output:**T**|13+|**T** = tensor(float), tensor(float16)| |||11+|**T** = tensor(float), tensor(float16)| |||1+|**T** = tensor(float), tensor(float16)| -|Identity|*in* input:**T**
*out* output:**T**

or

*in* input:**V**
*out* output:**V**|16+|**V** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| +|Identity|*in* input:**T**
*out* output:**T**

or

*in* input:**V**
*out* output:**V**|19+|**V** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
+|||16+|**V** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
|||14+|**V** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
|||13+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
|||1+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
@@ -1030,7 +1072,8 @@ Do not modify directly.*
|||11+|**T** = tensor(float), tensor(float16)|
|||1+|**T** = tensor(float), tensor(float16)|
|LpNormalization|*in* input:**T**
*out* output:**T**|1+|**T** = tensor(float), tensor(float16)| -|LpPool|*in* X:**T**
*out* Y:**T**|11+|**T** = tensor(float), tensor(float16)| +|LpPool|*in* X:**T**
*out* Y:**T**|18+|**T** = tensor(float), tensor(float16)| +|||11+|**T** = tensor(float), tensor(float16)| |||2+|**T** = tensor(float), tensor(float16)| |MatMul|*in* A:**T**
*in* B:**T**
*out* Y:**T**|13+|**T** = tensor(float), tensor(float16)|
|||9+|**T** = tensor(float), tensor(float16)|
@@ -1090,8 +1133,9 @@ Do not modify directly.*
|||12+|**T** = tensor(float), tensor(float16), tensor(int32)
**T1** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint8)| |||7+|**T** = tensor(float), tensor(float16)| |QLinearConv|*in* x:**T1**
*in* x_scale:**tensor(float)**
*in* x_zero_point:**T1**
*in* w:**T2**
*in* w_scale:**tensor(float)**
*in* w_zero_point:**T2**
*in* y_scale:**tensor(float)**
*in* y_zero_point:**T3**
*in* B:**T4**
*out* y:**T3**|10+|**T1** = tensor(int8), tensor(uint8)
**T2** = tensor(int8), tensor(uint8)
**T3** = tensor(int8), tensor(uint8)
**T4** = tensor(int32)| -|QLinearMatMul|*in* a:**T1**
*in* a_scale:**tensor(float)**
*in* a_zero_point:**T1**
*in* b:**T2**
*in* b_scale:**tensor(float)**
*in* b_zero_point:**T2**
*in* y_scale:**tensor(float)**
*in* y_zero_point:**T3**
*out* y:**T3**|10+|**T1** = tensor(int8), tensor(uint8)
**T2** = tensor(int8), tensor(uint8)
**T3** = tensor(int8), tensor(uint8)| -|QuantizeLinear|*in* x:**T1**
*in* y_scale:**T1**
*in* y_zero_point:**T2**
*out* y:**T2**

or

*in* x:**T1**
*in* y_scale:**tensor(float)**
*in* y_zero_point:**T2**
*out* y:**T2**|13+|**T1** = tensor(float), tensor(int32)
**T2** = tensor(int8), tensor(uint8)| +|QLinearMatMul|*in* a:**T1**
*in* a_scale:**TS**
*in* a_zero_point:**T1**
*in* b:**T2**
*in* b_scale:**TS**
*in* b_zero_point:**T2**
*in* y_scale:**TS**
*in* y_zero_point:**T3**
*out* y:**T3**

or

*in* a:**T1**
*in* a_scale:**tensor(float)**
*in* a_zero_point:**T1**
*in* b:**T2**
*in* b_scale:**tensor(float)**
*in* b_zero_point:**T2**
*in* y_scale:**tensor(float)**
*in* y_zero_point:**T3**
*out* y:**T3**|10+|**T1** = tensor(int8), tensor(uint8)
**T2** = tensor(int8), tensor(uint8)
**T3** = tensor(int8), tensor(uint8)| +|QuantizeLinear|*in* x:**T1**
*in* y_scale:**T1**
*in* y_zero_point:**T2**
*out* y:**T2**

or

*in* x:**T1**
*in* y_scale:**tensor(float)**
*in* y_zero_point:**T2**
*out* y:**T2**|19+|**T1** = tensor(float), tensor(float16), tensor(int32)
**T2** = tensor(int8), tensor(uint8)| +|||13+|**T1** = tensor(float), tensor(int32)
**T2** = tensor(int8), tensor(uint8)| |||10+|**T1** = tensor(float), tensor(int32)
**T2** = tensor(int8), tensor(uint8)| |RNN|*in* X:**T**
*in* W:**T**
*in* R:**T**
*in* B:**T**
*in* sequence_lens:**T1**
*in* initial_h:**T**
*out* Y:**T**
*out* Y_h:**T**|14+|**T** = tensor(float), tensor(float16)|
|||7+|**T** = tensor(float), tensor(float16)|
@@ -1142,11 +1186,12 @@ Do not modify directly.*
|Relu|*in* X:**T**
*out* Y:**T**|14+|**T** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int8)| |||13+|**T** = tensor(float), tensor(float16)| |||6+|**T** = tensor(float), tensor(float16)| -|Reshape|*in* data:**T**
*in* shape:**tensor(int64)**
*out* reshaped:**T**

or

*in* data:**T**
*out* reshaped:**T**|14+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| +|Reshape|*in* data:**T**
*in* shape:**tensor(int64)**
*out* reshaped:**T**

or

*in* data:**T**
*out* reshaped:**T**|19+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| +|||14+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| |||13+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| |||5+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| -|Resize|*in* X:**T**
*in* scales:**tensor(float)**
*out* Y:**T**

or

*in* X:**T1**
*in* roi:**T2**
*in* scales:**tensor(float)**
*in* sizes:**tensor(int64)**
*out* Y:**T1**|13+|**T1** = tensor(float), tensor(float16)
**T2** = tensor(float), tensor(float16)| -|||11+|**T1** = tensor(float), tensor(float16)
**T2** = tensor(float), tensor(float16)| +|Resize|*in* X:**T**
*in* scales:**tensor(float)**
*out* Y:**T**

or

*in* X:**T1**
*in* roi:**T2**
*in* scales:**tensor(float)**
*in* sizes:**tensor(int64)**
*out* Y:**T1**|13+|**T1** = tensor(float), tensor(float16), tensor(int8), tensor(uint8)
**T2** = tensor(float), tensor(float16)| +|||11+|**T1** = tensor(float), tensor(float16), tensor(int8), tensor(uint8)
**T2** = tensor(float), tensor(float16)| |||10+|**T** = tensor(float), tensor(float16)| |ReverseSequence|*in* input:**T**
*in* sequence_lens:**tensor(int64)**
*out* Y:**T**|10+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| |RoiAlign|*in* X:**T1**
*in* rois:**T1**
*in* batch_indices:**T2**
*out* Y:**T1**|16+|**T1** = tensor(float), tensor(float16)
**T2** = tensor(int32), tensor(int64)|
@@ -1170,7 +1215,8 @@ Do not modify directly.*
|SequenceErase|*in* input_sequence:**S**
*in* position:**I**
*out* output_sequence:**S**|11+|**I** = tensor(int32), tensor(int64)
**S** = seq(tensor(bfloat16)), seq(tensor(bool)), seq(tensor(double)), seq(tensor(float)), seq(tensor(float16)), seq(tensor(int16)), seq(tensor(int32)), seq(tensor(int64)), seq(tensor(int8)), seq(tensor(uint16)), seq(tensor(uint32)), seq(tensor(uint64)), seq(tensor(uint8))| |SequenceInsert|*in* input_sequence:**S**
*in* tensor:**T**
*in* position:**I**
*out* output_sequence:**S**|11+|**I** = tensor(int32), tensor(int64)
**S** = seq(tensor(bfloat16)), seq(tensor(bool)), seq(tensor(double)), seq(tensor(float)), seq(tensor(float16)), seq(tensor(int16)), seq(tensor(int32)), seq(tensor(int64)), seq(tensor(int8)), seq(tensor(uint16)), seq(tensor(uint32)), seq(tensor(uint64)), seq(tensor(uint8))| |SequenceLength|*in* input_sequence:**S**
*out* length:**I**|11+|**I** = tensor(int64)
**S** = seq(tensor(bfloat16)), seq(tensor(bool)), seq(tensor(double)), seq(tensor(float)), seq(tensor(float16)), seq(tensor(int16)), seq(tensor(int32)), seq(tensor(int64)), seq(tensor(int8)), seq(tensor(uint16)), seq(tensor(uint32)), seq(tensor(uint64)), seq(tensor(uint8))| -|Shape|*in* data:**T**
*out* shape:**T1**|15+|**T** = seq(tensor(bool)), seq(tensor(double)), seq(tensor(float)), seq(tensor(float16)), seq(tensor(int16)), seq(tensor(int32)), seq(tensor(int64)), seq(tensor(int8)), seq(tensor(uint16)), seq(tensor(uint32)), seq(tensor(uint64)), seq(tensor(uint8)), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**T1** = tensor(int64)| +|Shape|*in* data:**T**
*out* shape:**T1**|19+|**T** = seq(tensor(bool)), seq(tensor(double)), seq(tensor(float)), seq(tensor(float16)), seq(tensor(int16)), seq(tensor(int32)), seq(tensor(int64)), seq(tensor(int8)), seq(tensor(uint16)), seq(tensor(uint32)), seq(tensor(uint64)), seq(tensor(uint8)), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**T1** = tensor(int64)| +|||15+|**T** = seq(tensor(bool)), seq(tensor(double)), seq(tensor(float)), seq(tensor(float16)), seq(tensor(int16)), seq(tensor(int32)), seq(tensor(int64)), seq(tensor(int8)), seq(tensor(uint16)), seq(tensor(uint32)), seq(tensor(uint64)), seq(tensor(uint8)), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**T1** = tensor(int64)| |||13+|**T** = seq(tensor(bool)), seq(tensor(double)), seq(tensor(float)), seq(tensor(float16)), seq(tensor(int16)), seq(tensor(int32)), seq(tensor(int64)), seq(tensor(int8)), seq(tensor(uint16)), seq(tensor(uint32)), seq(tensor(uint64)), seq(tensor(uint8)), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**T1** = tensor(int64)| |||1+|**T** = seq(tensor(bool)), seq(tensor(double)), seq(tensor(float)), seq(tensor(float16)), seq(tensor(int16)), seq(tensor(int32)), seq(tensor(int64)), seq(tensor(int8)), seq(tensor(uint16)), seq(tensor(uint32)), seq(tensor(uint64)), seq(tensor(uint8)), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**T1** = tensor(int64)| |Shrink|*in* input:**T**
*out* output:**T**|9+|**T** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint8)|
@@ -1180,7 +1226,8 @@ Do not modify directly.*
|||9+|**T** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
|Sin|*in* input:**T**
*out* output:**T**|7+|**T** = tensor(float), tensor(float16)| |Sinh|*in* input:**T**
*out* output:**T**|9+|**T** = tensor(float), tensor(float16)| -|Size|*in* data:**T**
*out* size:**T1**|13+|**T** = seq(tensor(bool)), seq(tensor(double)), seq(tensor(float)), seq(tensor(float16)), seq(tensor(int16)), seq(tensor(int32)), seq(tensor(int64)), seq(tensor(int8)), seq(tensor(uint16)), seq(tensor(uint32)), seq(tensor(uint64)), seq(tensor(uint8)), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**T1** = tensor(int64)| +|Size|*in* data:**T**
*out* size:**T1**|19+|**T** = seq(tensor(bool)), seq(tensor(double)), seq(tensor(float)), seq(tensor(float16)), seq(tensor(int16)), seq(tensor(int32)), seq(tensor(int64)), seq(tensor(int8)), seq(tensor(uint16)), seq(tensor(uint32)), seq(tensor(uint64)), seq(tensor(uint8)), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**T1** = tensor(int64)| +|||13+|**T** = seq(tensor(bool)), seq(tensor(double)), seq(tensor(float)), seq(tensor(float16)), seq(tensor(int16)), seq(tensor(int32)), seq(tensor(int64)), seq(tensor(int8)), seq(tensor(uint16)), seq(tensor(uint32)), seq(tensor(uint64)), seq(tensor(uint8)), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**T1** = tensor(int64)| |||1+|**T** = seq(tensor(bool)), seq(tensor(double)), seq(tensor(float)), seq(tensor(float16)), seq(tensor(int16)), seq(tensor(int32)), seq(tensor(int64)), seq(tensor(int8)), seq(tensor(uint16)), seq(tensor(uint32)), seq(tensor(uint64)), seq(tensor(uint8)), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**T1** = tensor(int64)| |Slice|*in* data:**T**
*in* starts:**Tind**
*in* ends:**Tind**
*in* axes:**Tind**
*in* steps:**Tind**
*out* output:**T**

or

*in* data:**T**
*out* output:**T**|13+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**Tind** = tensor(int32), tensor(int64)| |||11+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**Tind** = tensor(int32), tensor(int64)|
@@ -1239,14 +1286,21 @@ Do not modify directly.*
|BiasSplitGelu|*in* X:**T**
*in* bias:**T**
*out* Y:**T**|1+|**T** = tensor(float), tensor(float16)| |ConvTransposeWithDynamicPads|*in* X:**T**
*in* W:**T**
*in* Pads:**tensor(int64)**
*in* B:**T**
*out* Y:**T**|1+|**T** = tensor(float), tensor(float16)| |DequantizeLinear|*in* x:**T1**
*in* x_scale:**T2**
*in* x_zero_point:**T1**
*out* y:**T2**|1+|**T1** = tensor(int32), tensor(int8), tensor(uint8)
**T2** = tensor(float), tensor(float16)| +|DynamicQuantizeMatMul|*in* A:**T1**
*in* B:**T2**
*in* b_scale:**T1**
*in* b_zero_point:**T2**
*in* bias:**T1**
*out* Y:**T1**|1+|**T1** = tensor(float)
**T2** = tensor(int8), tensor(uint8)| |EmbedLayerNormalization|*in* input_ids:**T1**
*in* segment_ids:**T1**
*in* word_embedding:**T**
*in* position_embedding:**T**
*in* segment_embedding:**T**
*in* gamma:**T**
*in* beta:**T**
*in* mask:**T1**
*in* position_ids:**T1**
*out* output:**T**
*out* mask_index:**T1**
*out* embedding_sum:**T**|1+|**T** = tensor(float), tensor(float16)| +|FastGelu|*in* X:**T**
*in* bias:**T**
*out* Y:**T**|1+|**T** = tensor(float), tensor(float16)| |FusedMatMul|*in* A:**T**
*in* B:**T**
*out* Y:**T**|1+|**T** = tensor(float), tensor(float16)| |FusedMatMulActivation|*in* A:**T**
*in* B:**T**
*out* Y:**T**|1+|**T** = tensor(float), tensor(float16)| |Gelu|*in* X:**T**
*out* Y:**T**|1+|**T** = tensor(float), tensor(float16)| |GroupNorm|*in* X:**T**
*in* gamma:**M**
*in* beta:**M**
*out* Y:**T**|1+|**M** = tensor(float), tensor(float16)
**T** = tensor(float), tensor(float16)| +|MatMulIntegerToFloat|*in* A:**T1**
*in* B:**T2**
*in* a_scale:**T3**
*in* b_scale:**T3**
*in* a_zero_point:**T1**
*in* b_zero_point:**T2**
*in* bias:**T3**
*out* Y:**T3**|1+|**T1** = tensor(int8), tensor(uint8)
**T2** = tensor(int8), tensor(uint8)
**T3** = tensor(float), tensor(float16)| |MultiHeadAttention|*in* query:**T**
*in* key:**T**
*in* value:**T**
*in* bias:**T**
*in* key_padding_mask:**M**
*in* relative_position_bias:**T**
*in* past_key:**T**
*in* past_value:**T**
*out* output:**T**
*out* present_key:**T**
*out* present_value:**T**|1+|**M** = tensor(int32)
**T** = tensor(float), tensor(float16)| |NhwcConv|*in* X:**T**
*in* W:**T**
*in* B:**T**
*out* Y:**T**|1+|**T** = tensor(float), tensor(float16)| +|QAttention|*in* input:**T1**
*in* weight:**T2**
*in* bias:**T3**
*in* input_scale:**T3**
*in* weight_scale:**T3**
*in* mask_index:**T4**
*in* input_zero_point:**T1**
*in* weight_zero_point:**T2**
*in* past:**T3**
*out* output:**T3**
*out* present:**T3**|1+|**T1** = tensor(int8), tensor(uint8)
**T2** = tensor(int8), tensor(uint8)
**T3** = tensor(float), tensor(float16)
**T4** = tensor(int32)| |QLinearAdd|*in* A:**T**
*in* A_scale:**tensor(float)**
*in* A_zero_point:**T**
*in* B:**T**
*in* B_scale:**tensor(float)**
*in* B_zero_point:**T**
*in* C_scale:**tensor(float)**
*in* C_zero_point:**T**
*out* C:**T**|1+|**T** = tensor(int8), tensor(uint8)| +|QLinearAveragePool|*in* X:**T**
*in* x_scale:**tensor(float)**
*in* x_zero_point:**T**
*in* y_scale:**tensor(float)**
*in* y_zero_point:**T**
*out* Y:**T**|1+|**T** = tensor(int8), tensor(uint8)| +|QLinearConcat|*in* Y_scale:**TF**
*in* Y_zero_point:**T8**
*in* inputs:**TV**
*out* Y:**T8**|1+|**T8** = tensor(int8), tensor(uint8)
**TF** = tensor(float)
**TV** = tensor(float), tensor(int8), tensor(uint8)| +|QLinearGlobalAveragePool|*in* X:**T**
*in* x_scale:**tensor(float)**
*in* x_zero_point:**T**
*in* y_scale:**tensor(float)**
*in* y_zero_point:**T**
*out* Y:**T**|1+|**T** = tensor(int8), tensor(uint8)| |QLinearSigmoid|*in* X:**T**
*in* X_scale:**tensor(float)**
*in* X_zero_point:**T**
*in* Y_scale:**tensor(float)**
*in* Y_zero_point:**T**
*out* Y:**T**|1+|**T** = tensor(int8), tensor(uint8)| |QuantizeLinear|*in* x:**T1**
*in* y_scale:**T1**
*in* y_zero_point:**T2**
*out* y:**T2**|1+|**T1** = tensor(float), tensor(float16), tensor(int32)
**T2** = tensor(int8), tensor(uint8)| |QuickGelu|*in* X:**T**
*out* Y:**T**|1+|**T** = tensor(float), tensor(float16)|

diff --git a/docs/python/README.rst b/docs/python/README.rst
index 32bb3729e01d..bbc8571fe3f1 100644
--- a/docs/python/README.rst
+++ b/docs/python/README.rst
@@ -8,6 +8,11 @@ For more information on ONNX Runtime, please see `aka.ms/onnxruntime ;
+// TODO: When other compilers support std::chrono::operator<<, update this.
+// TODO: Check other compilers' support before enabling C++20 for them.
+// Xcode added support for C++20's std::chrono::operator<< in SDK version 14.4.
+#if __cplusplus >= 202002L && __MAC_OS_X_VERSION_MAX_ALLOWED >= 140400L
+namespace timestamp_ns = std::chrono;
+#else
+namespace timestamp_ns = ::date;
+#endif
+
 #ifndef NDEBUG
 ORT_ATTRIBUTE_UNUSED static bool vlog_enabled = true;  // Set directly based on your needs.
 #else
@@ -75,6 +86,21 @@ struct Category {
   // TODO: What other high level categories are meaningful? Model? Optimizer? Execution?
 };

+///
+/// ORT TraceLogging keywords for categories of dynamic logging enablement
+///
+enum class ORTTraceLoggingKeyword : uint64_t {
+  Session = 0x1,     // ORT Session TraceLoggingWrite
+  Logs = 0x2,        // LOGS() macro ORT logs. Pair with an appropriate level depending on the detail required
+  Reserved1 = 0x4,   // Reserved in case we want to add specific sub-categories instead of just LOGS() or other uses
+  Reserved2 = 0x8,
+  Reserved3 = 0x10,
+  Reserved4 = 0x20,
+  Reserved5 = 0x40,
+  Reserved6 = 0x80,
+  Profiling = 0x100  // Enables profiling. At levels >5 it can impact inference performance
+};
+
 class ISink;
 class Logger;
 class Capture;
@@ -333,5 +359,17 @@ unsigned int GetThreadId();
  */
 unsigned int GetProcessId();

+/**
+  If the ONNXRuntimeTraceLoggingProvider ETW provider is enabled, an ETW sink is added to the existing logger.
+*/
+std::unique_ptr<Logger> EnhanceLoggerWithEtw(std::unique_ptr<Logger> existingLogger, logging::Severity originalSeverity,
+                                             logging::Severity etwSeverity);
+
+/**
+  If the ONNXRuntimeTraceLoggingProvider ETW provider is enabled, the logging level can be overridden.
+  This overridden level applies only to the ETW sink; the original logger(s) retain their original logging level.
+*/
+Severity OverrideLevelWithEtw(Severity originalSeverity);
+
 }  // namespace logging
 }  // namespace onnxruntime

diff --git a/include/onnxruntime/core/framework/allocator.h b/include/onnxruntime/core/framework/allocator.h
index 9015b23296e0..097873c5e365 100644
--- a/include/onnxruntime/core/framework/allocator.h
+++ b/include/onnxruntime/core/framework/allocator.h
@@ -80,7 +80,6 @@ class IAllocator {
   virtual void Free(void* p) = 0;

-  // TODO: Find a better name than Reserve() and update in all places.
   // Reserve() is an interface exposed for an implementation of IAllocator
   // to optionally implement some allocation logic that by-passes any arena-based
   // logic that may be housed in the Alloc() implementation.
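The Reserve() comment above states the contract: an allocator may route Reserve() around its arena so that one-off buffers are not retained in the pooled memory. A minimal sketch of a custom IAllocator honoring that contract; the "arena" here is only a stand-in and the bookkeeping is an illustrative assumption:

```cpp
#include <cstdlib>
#include <mutex>
#include <unordered_set>

#include "core/framework/allocator.h"

namespace demo {

class ToyArenaAllocator : public onnxruntime::IAllocator {
 public:
  explicit ToyArenaAllocator(const OrtMemoryInfo& info) : IAllocator(info) {}

  // Pooled path: a real arena would carve from preallocated chunks;
  // plain malloc stands in for that here.
  void* Alloc(size_t size) override { return std::malloc(size); }

  // Reserve() deliberately skips any arena bookkeeping so the buffer
  // is not kept in the pool after it is freed.
  void* Reserve(size_t size) override {
    void* p = std::malloc(size);
    std::lock_guard<std::mutex> g(mu_);
    reserved_.insert(p);
    return p;
  }

  void Free(void* p) override {
    {
      std::lock_guard<std::mutex> g(mu_);
      reserved_.erase(p);  // a real arena would return non-reserved blocks to the pool
    }
    std::free(p);
  }

 private:
  std::mutex mu_;
  std::unordered_set<void*> reserved_;
};

}  // namespace demo
```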
diff --git a/include/onnxruntime/core/framework/data_types_internal.h b/include/onnxruntime/core/framework/data_types_internal.h
index fbeee8a2aedc..3a3b5cb6888f 100644
--- a/include/onnxruntime/core/framework/data_types_internal.h
+++ b/include/onnxruntime/core/framework/data_types_internal.h
@@ -305,7 +305,7 @@ class CallableDispatchableHelper {
     return 0;
   }

-  void CheckCalledOnce() {
+  void CheckCalledOnce() const {
     ORT_ENFORCE(called_ == 1, "Unsupported data type: ", dt_type_);
   }
 };

diff --git a/include/onnxruntime/core/framework/execution_provider.h b/include/onnxruntime/core/framework/execution_provider.h
index ea4f52f99649..16ad943a5f47 100644
--- a/include/onnxruntime/core/framework/execution_provider.h
+++ b/include/onnxruntime/core/framework/execution_provider.h
@@ -33,6 +33,8 @@ class Node;
 #include "core/framework/stream_handles.h"
 #include "core/framework/tuning_context.h"

+struct OrtRunOptions;
+
 namespace onnxruntime {

 /**
@@ -51,6 +53,8 @@ struct NodeComputeInfo {
   DestroyFunctionStateFunc release_state_func;
 };

+using RunOptions = ::OrtRunOptions;
+
 enum class DataLayout {
   NCHW,
   NHWC,
@@ -59,14 +63,11 @@ class IExecutionProvider {
  protected:
-  IExecutionProvider(const std::string& type, bool use_metadef_id_creator = false)
-      : IExecutionProvider(type, OrtDevice(), use_metadef_id_creator) {}
+  IExecutionProvider(const std::string& type)
+      : IExecutionProvider(type, OrtDevice()) {}

-  IExecutionProvider(const std::string& type, OrtDevice device, bool use_metadef_id_creator = false)
+  IExecutionProvider(const std::string& type, OrtDevice device)
       : default_device_(device), type_{type} {
-    if (use_metadef_id_creator) {
-      metadef_id_generator_ = std::make_unique<ModelMetadefIdGenerator>();
-    }
   }

   /*
@@ -187,7 +188,7 @@
      Run may not be finished on device. This function should be regarded as the
      point after which a new Run would start to submit commands from the CPU.
   */
-  virtual common::Status OnRunStart() { return Status::OK(); }
+  virtual common::Status OnRunStart(const onnxruntime::RunOptions& /*run_options*/) { return Status::OK(); }

   /**
      Called when InferenceSession::Run ended
@@ -195,25 +196,27 @@
      may not be finished on device. This function should be regarded as the point
      at which all commands of the current Run have been submitted by the CPU.
   */
-  virtual common::Status OnRunEnd(bool /*sync_stream*/) { return Status::OK(); }
+  virtual common::Status OnRunEnd(bool /*sync_stream*/, const onnxruntime::RunOptions& /*run_options*/) {
+    return Status::OK();
+  }

   /**
      Indicate whether the graph capturing mode (e.g., cuda graph) is enabled for
-     the provider. Currently only CUDA execution provider supports it.
+     the provider.
   */
   virtual bool IsGraphCaptureEnabled() const { return false; }

   /**
-     Indicate whether the graph has been captured and instantiated. Currently
-     only CUDA execution provider supports it.
+     Indicate whether the graph has been captured and instantiated.
   */
-  virtual bool IsGraphCaptured() const { return false; }
+  virtual bool IsGraphCaptured(int /*graph_annotation_id*/) const { return false; }

   /**
-     Run the instantiated graph. Currently only CUDA execution provider supports
-     it.
+     Run the instantiated graph.
   */
-  virtual common::Status ReplayGraph() { return Status::OK(); }
+  virtual common::Status ReplayGraph(int /*graph_annotation_id*/) {
+    return Status::OK();
+  }
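The signature changes above thread the per-run options and a graph annotation id through the EP interface. A sketch of a provider picking them up under the new signatures; the class name and the capture registry are illustrative, not part of this change:

```cpp
#include <set>

#include "core/framework/execution_provider.h"

namespace demo {

class MyExecutionProvider : public onnxruntime::IExecutionProvider {
 public:
  MyExecutionProvider() : IExecutionProvider("MyExecutionProvider") {}

  // Per-run options now reach the start/end hooks.
  onnxruntime::common::Status OnRunStart(const onnxruntime::RunOptions& /*run_options*/) override {
    // Inspect run-level configuration here, before any commands are submitted.
    return onnxruntime::common::Status::OK();
  }

  onnxruntime::common::Status OnRunEnd(bool /*sync_stream*/,
                                       const onnxruntime::RunOptions& /*run_options*/) override {
    return onnxruntime::common::Status::OK();
  }

  // Graph capture is now queried per annotation id rather than per provider.
  bool IsGraphCaptureEnabled() const override { return true; }

  bool IsGraphCaptured(int graph_annotation_id) const override {
    return captured_.count(graph_annotation_id) != 0;
  }

  onnxruntime::common::Status ReplayGraph(int /*graph_annotation_id*/) override {
    // Launch the graph previously instantiated for this annotation id.
    return onnxruntime::common::Status::OK();
  }

 private:
  std::set<int> captured_;  // populated during capture (not shown)
};

}  // namespace demo
```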
*/ - virtual common::Status ReplayGraph() { return Status::OK(); } + virtual common::Status ReplayGraph(int /*graph_annotation_id*/) { + return Status::OK(); + } /** Called when session creation is complete @@ -274,19 +277,6 @@ class IExecutionProvider { return logger_; } - /** Generate a unique id that can be used in a MetaDef name. Values are unique for a model instance. - The model hash is also returned if you wish to include that in the MetaDef name to ensure uniqueness across models. - @param graph_viewer[in] Graph viewer that GetCapability was called with. Can be for the main graph or nested graph. - @param model_hash[out] Returns the hash for the main (i.e. top level) graph in the model. - This is created using the model path if available, - or the model input names and the output names from all nodes in the main graph. - @remarks e.g. the TensorRT Execution Provider is used in multiple sessions and the underlying infrastructure caches - compiled kernels, so the name must be unique and deterministic across models and sessions. - NOTE: Ideally this would be a protected method, but to work across the EP bridge it has to be public and - virtual, and ModelMetadefIdGenerator but be defined in the header as well. - */ - virtual int GenerateMetaDefId(const onnxruntime::GraphViewer& graph_viewer, HashValue& model_hash) const; - virtual std::unique_ptr GetProfiler() { return {}; } @@ -326,23 +316,19 @@ class IExecutionProvider { */ virtual std::vector CreatePreferredAllocators() { return std::vector(); }; + /** + * Get the array of pointers for EPContext nodes + * EP needs to implement this if has the requirement to generate the context cache model. Otherwise leave it. + * Default return an empty vector if not provided by the Execution Provider + */ + virtual const InlinedVector GetEpContextNodes() const { + return InlinedVector(); + } + private: const std::string type_; // It will be set when this object is registered to a session const logging::Logger* logger_ = nullptr; - - // helper to generate ids that are unique to model and deterministic, even if the execution provider is shared across - // multiple sessions. 
- class ModelMetadefIdGenerator { - public: - int GenerateId(const onnxruntime::GraphViewer& graph_viewer, HashValue& model_hash); - - private: - std::unordered_map main_graph_hash_; // map graph instance hash to model contents hash - std::unordered_map model_metadef_id_; // current unique id for model - }; - - std::unique_ptr metadef_id_generator_; }; } // namespace onnxruntime diff --git a/include/onnxruntime/core/framework/op_kernel_info.h b/include/onnxruntime/core/framework/op_kernel_info.h index b31c85e32f80..a0bbfe50a700 100644 --- a/include/onnxruntime/core/framework/op_kernel_info.h +++ b/include/onnxruntime/core/framework/op_kernel_info.h @@ -28,7 +28,8 @@ class OpKernelInfo : public OpNodeProtoHelper { const std::unordered_map& constant_initialized_tensors, const OrtValueNameIdxMap& mlvalue_name_idx_map, const DataTransferManager& data_transfer_mgr, - const AllocatorMap& allocators = {}); + const AllocatorMap& allocators, + const ConfigOptions& config_options); OpKernelInfo(const OpKernelInfo& other); @@ -50,6 +51,8 @@ class OpKernelInfo : public OpNodeProtoHelper { const AllocatorMap& GetAllocators() const { return allocators_; } + const ConfigOptions& GetConfigOptions() const { return config_options_; } + private: ORT_DISALLOW_MOVE(OpKernelInfo); ORT_DISALLOW_ASSIGNMENT(OpKernelInfo); @@ -64,6 +67,7 @@ class OpKernelInfo : public OpNodeProtoHelper { const DataTransferManager& data_transfer_mgr_; ProtoHelperNodeContext proto_helper_context_; const AllocatorMap& allocators_; + const ConfigOptions& config_options_; }; } // namespace onnxruntime diff --git a/include/onnxruntime/core/framework/run_options.h b/include/onnxruntime/core/framework/run_options.h index 5444c825d799..789c3b13f2c3 100644 --- a/include/onnxruntime/core/framework/run_options.h +++ b/include/onnxruntime/core/framework/run_options.h @@ -45,5 +45,5 @@ struct OrtRunOptions { }; namespace onnxruntime { -using RunOptions = OrtRunOptions; +using RunOptions = ::OrtRunOptions; } // namespace onnxruntime diff --git a/include/onnxruntime/core/framework/stream_handles.h b/include/onnxruntime/core/framework/stream_handles.h index c235ee904762..26d78133b52f 100644 --- a/include/onnxruntime/core/framework/stream_handles.h +++ b/include/onnxruntime/core/framework/stream_handles.h @@ -100,6 +100,8 @@ class Stream { return nullptr; } + virtual WaitNotificationFn GetWaitNotificationFn() const { return nullptr; } + private: StreamHandle handle_; const OrtDevice& device_; diff --git a/include/onnxruntime/core/graph/constants.h b/include/onnxruntime/core/graph/constants.h index 9b26ba914c7d..8e04050d089a 100644 --- a/include/onnxruntime/core/graph/constants.h +++ b/include/onnxruntime/core/graph/constants.h @@ -31,6 +31,7 @@ constexpr size_t kMaxExecutionProviderNameLen = 30; constexpr const char* kCpuExecutionProvider = "CPUExecutionProvider"; constexpr const char* kCudaExecutionProvider = "CUDAExecutionProvider"; +constexpr const char* kCudaNHWCExecutionProvider = "CUDANHWCExecutionProvider"; constexpr const char* kDnnlExecutionProvider = "DnnlExecutionProvider"; constexpr const char* kOpenVINOExecutionProvider = "OpenVINOExecutionProvider"; constexpr const char* kVitisAIExecutionProvider = "VitisAIExecutionProvider"; diff --git a/include/onnxruntime/core/graph/graph.h b/include/onnxruntime/core/graph/graph.h index 22827d43b200..3b417a362d2c 100644 --- a/include/onnxruntime/core/graph/graph.h +++ b/include/onnxruntime/core/graph/graph.h @@ -21,7 +21,7 @@ #pragma warning(pop) #endif -#include "flatbuffers/flatbuffers.h" 
+#include "core/common/flatbuffers.h" #include "core/common/gsl.h" @@ -621,6 +621,22 @@ class Node { // Reference to the function template defined in the model. const FunctionTemplate* func_template_ = nullptr; + + // set/clear NodeProto that the Node was created from. + // Set by Graph ctor when loading a model from file. + // Cleared after first call to onnx::check_node in VerifyNodeAndOpMatch when the first Graph::Resolve runs. + void SetOriginalNodeProto(const ONNX_NAMESPACE::NodeProto* node_proto) { + original_node_proto_ = node_proto; + } + + const ONNX_NAMESPACE::NodeProto* GetOriginalNodeProto() const { + return original_node_proto_; + } + + // NodeProto that the Node was created from. We temporarily set this as a performance optimization to avoid calling + // Node::ToProto when running onnx::check_node in the first Graph::Resolve. At that point we know all the nodes are + // unchanged from the original model. + const ONNX_NAMESPACE::NodeProto* original_node_proto_ = nullptr; #endif // Execution priority, lower value for higher priority @@ -753,7 +769,6 @@ class Graph { // NOLINT(clang-analyzer-optin.performance.Padding): preserve exi cannot be overridden at runtime. If the initializer is not found or is not constant, a nullptr is returned. @param check_outer_scope If true and the graph is a subgraph, check ancestor graph/s for 'name' if not found in 'graph'. - @remarks check_outer_scope of true is not supported in a minimal build */ const ONNX_NAMESPACE::TensorProto* GetConstantInitializer(const std::string& name, bool check_outer_scope) const; diff --git a/include/onnxruntime/core/graph/graph_viewer.h b/include/onnxruntime/core/graph/graph_viewer.h index 3cdbb07099ca..1023d5031018 100644 --- a/include/onnxruntime/core/graph/graph_viewer.h +++ b/include/onnxruntime/core/graph/graph_viewer.h @@ -165,7 +165,8 @@ class GraphViewer { if a const initializer is part of the underlying Graph but not part of this GraphViewer, it will still be returned instead of nullptr */ - const ONNX_NAMESPACE::TensorProto* GetConstantInitializer(const std::string& name, bool check_outer_scope) const; + const ONNX_NAMESPACE::TensorProto* GetConstantInitializer(const std::string& name, + bool check_outer_scope = true) const; /** Get the Node containing this Graph if IsSubgraph is true. Returns nullptr otherwise. */ const Node* ParentNode() const noexcept { return graph_->ParentNode(); } diff --git a/include/onnxruntime/core/providers/cann/cann_provider_options.h b/include/onnxruntime/core/providers/cann/cann_provider_options.h index ac60fbe4a293..51b423e68110 100644 --- a/include/onnxruntime/core/providers/cann/cann_provider_options.h +++ b/include/onnxruntime/core/providers/cann/cann_provider_options.h @@ -16,6 +16,7 @@ struct OrtCANNProviderOptions { int enable_cann_graph; // Flag indicating if prioritizing the use of // CANN's graph-running capabilities int dump_graphs; // Flag indicating if dumping graphs + int dump_om_model; // Flag indicating if dumping om model std::string precision_mode; // Operator Precision Mode std::string op_select_impl_mode; // Operator-level model compilation options: // Mode selection diff --git a/include/onnxruntime/core/providers/coreml/coreml_provider_factory.h b/include/onnxruntime/core/providers/coreml/coreml_provider_factory.h index 03715eb5b78b..55abb90b981f 100644 --- a/include/onnxruntime/core/providers/coreml/coreml_provider_factory.h +++ b/include/onnxruntime/core/providers/coreml/coreml_provider_factory.h @@ -28,9 +28,12 @@ enum COREMLFlags { // dynamic shapes. 
However, the performance may be negatively impacted if inputs have dynamic shapes. COREML_FLAG_ONLY_ALLOW_STATIC_INPUT_SHAPES = 0x008, + // Create an MLProgram. By default it will create a NeuralNetwork model. Requires Core ML 5 or later. + COREML_FLAG_CREATE_MLPROGRAM = 0x010, + // Keep COREML_FLAG_LAST at the end of the enum definition // And assign the last COREMLFlag to it - COREML_FLAG_LAST = COREML_FLAG_ONLY_ALLOW_STATIC_INPUT_SHAPES, + COREML_FLAG_LAST = COREML_FLAG_CREATE_MLPROGRAM, }; #ifdef __cplusplus diff --git a/include/onnxruntime/core/providers/cuda/cuda_context.h b/include/onnxruntime/core/providers/cuda/cuda_context.h index d73d551920d4..7104e70c3a8a 100644 --- a/include/onnxruntime/core/providers/cuda/cuda_context.h +++ b/include/onnxruntime/core/providers/cuda/cuda_context.h @@ -16,9 +16,10 @@ #include "core/providers/custom_op_context.h" #include <cuda.h> #include <cuda_runtime.h> +#ifndef USE_CUDA_MINIMAL #include <cublas_v2.h> #include <cudnn.h> - +#endif namespace Ort { namespace Custom { @@ -28,38 +29,47 @@ struct CudaContext : public CustomOpContext { cudnnHandle_t cudnn_handle = {}; cublasHandle_t cublas_handle = {}; OrtAllocator* deferred_cpu_allocator = {}; + // below are cuda ep options + int16_t device_id = 0; + int32_t arena_extend_strategy = 0; + int32_t cudnn_conv_algo_search = 0; + bool cudnn_conv_use_max_workspace = true; + bool cudnn_conv1d_pad_to_nc1d = false; + bool enable_skip_layer_norm_strict_mode = false; + bool prefer_nhwc = false; + bool use_tf32 = true; void Init(const OrtKernelContext& kernel_ctx) { - const auto& ort_api = Ort::GetApi(); - void* resource = {}; - OrtStatus* status = nullptr; - - status = ort_api.KernelContext_GetResource(&kernel_ctx, ORT_CUDA_RESOUCE_VERSION, CudaResource::cuda_stream_t, &resource); - if (status) { - ORT_CXX_API_THROW("failed to fetch cuda stream", OrtErrorCode::ORT_RUNTIME_EXCEPTION); - } - cuda_stream = reinterpret_cast<cudaStream_t>(resource); - - resource = {}; - status = ort_api.KernelContext_GetResource(&kernel_ctx, ORT_CUDA_RESOUCE_VERSION, CudaResource::cudnn_handle_t, &resource); - if (status) { - ORT_CXX_API_THROW("failed to fetch cudnn handle", OrtErrorCode::ORT_RUNTIME_EXCEPTION); - } - cudnn_handle = reinterpret_cast<cudnnHandle_t>(resource); + cuda_stream = FetchResource<cudaStream_t>(kernel_ctx, CudaResource::cuda_stream_t); + cudnn_handle = FetchResource<cudnnHandle_t>(kernel_ctx, CudaResource::cudnn_handle_t); + cublas_handle = FetchResource<cublasHandle_t>(kernel_ctx, CudaResource::cublas_handle_t); + deferred_cpu_allocator = FetchResource<OrtAllocator*>(kernel_ctx, CudaResource::deferred_cpu_allocator_t); + + device_id = FetchResource<int16_t>(kernel_ctx, CudaResource::device_id_t); + arena_extend_strategy = FetchResource<int32_t>(kernel_ctx, CudaResource::arena_extend_strategy_t); + cudnn_conv_algo_search = FetchResource<int32_t>(kernel_ctx, CudaResource::cudnn_conv_algo_search_t); + cudnn_conv_use_max_workspace = FetchResource<bool>(kernel_ctx, CudaResource::cudnn_conv_use_max_workspace_t); + + cudnn_conv1d_pad_to_nc1d = FetchResource<bool>(kernel_ctx, CudaResource::cudnn_conv1d_pad_to_nc1d_t); + enable_skip_layer_norm_strict_mode = FetchResource<bool>(kernel_ctx, CudaResource::enable_skip_layer_norm_strict_mode_t); + prefer_nhwc = FetchResource<bool>(kernel_ctx, CudaResource::prefer_nhwc_t); + use_tf32 = FetchResource<bool>(kernel_ctx, CudaResource::use_tf32_t); + } - resource = {}; - status = ort_api.KernelContext_GetResource(&kernel_ctx, ORT_CUDA_RESOUCE_VERSION, CudaResource::cublas_handle_t, &resource); - if (status) { - ORT_CXX_API_THROW("failed to fetch cublas handle", OrtErrorCode::ORT_RUNTIME_EXCEPTION); + template <typename T> + T FetchResource(const OrtKernelContext& kernel_ctx, CudaResource resource_type) { + if constexpr (sizeof(T) > sizeof(void*)) { + ORT_CXX_API_THROW("void* is not large enough to hold resource type: " + std::to_string(resource_type), OrtErrorCode::ORT_INVALID_ARGUMENT); } - cublas_handle = reinterpret_cast<cublasHandle_t>(resource); - - resource = {}; - status = ort_api.KernelContext_GetResource(&kernel_ctx, ORT_CUDA_RESOUCE_VERSION, CudaResource::deferred_cpu_allocator_t, &resource); + const auto& ort_api = Ort::GetApi(); + void* resource = {}; + OrtStatus* status = ort_api.KernelContext_GetResource(&kernel_ctx, ORT_CUDA_RESOUCE_VERSION, resource_type, &resource); if (status) { - ORT_CXX_API_THROW("failed to fetch deferred cpu allocator", OrtErrorCode::ORT_RUNTIME_EXCEPTION); + ORT_CXX_API_THROW("Failed to fetch cuda ep resource, resource type: " + std::to_string(resource_type), OrtErrorCode::ORT_RUNTIME_EXCEPTION); } - deferred_cpu_allocator = reinterpret_cast<OrtAllocator*>(resource); + T t = {}; + memcpy(&t, &resource, sizeof(T)); + return t; } void* AllocDeferredCpuMem(size_t size) const {
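The rewritten Init() above collapses the per-resource boilerplate into FetchResource<T>() and now also surfaces the CUDA EP options. A hedged usage sketch follows; the kernel function and its wiring are invented for illustration, while the CudaContext members are the ones declared in the header above.

```cpp
// Illustrative only: reading the new CUDA EP options from a custom op kernel
// through Ort::Custom::CudaContext. prefer_nhwc, use_tf32 and cuda_stream are
// the members populated by Init() above; MyKernelCompute is hypothetical.
#include "core/providers/cuda/cuda_context.h"

void MyKernelCompute(const OrtKernelContext& kernel_ctx) {
  Ort::Custom::CudaContext cuda_ctx;
  cuda_ctx.Init(kernel_ctx);  // one call now populates handles and all EP options

  if (cuda_ctx.prefer_nhwc) {
    // dispatch an NHWC variant of the kernel
  }
  if (!cuda_ctx.use_tf32) {
    // keep custom GEMM-like math in full fp32 when the EP disabled TF32
  }
  // launch work on the EP's stream, e.g.
  // my_kernel<<<grid, block, 0, cuda_ctx.cuda_stream>>>(...);
}
```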
diff --git a/include/onnxruntime/core/providers/cuda/cuda_provider_options.h b/include/onnxruntime/core/providers/cuda/cuda_provider_options.h index 82bb8ba83be4..6d53760ab60b 100644 --- a/include/onnxruntime/core/providers/cuda/cuda_provider_options.h +++ b/include/onnxruntime/core/providers/cuda/cuda_provider_options.h @@ -37,4 +37,5 @@ struct OrtCUDAProviderOptionsV2 { // The strict mode has better accuracy but lower performance. int prefer_nhwc = 0; // make the CUDA EP NHWC preferred int use_ep_level_unified_stream = 0; // flag specifying if ep level stream is used or not + int use_tf32 = 1; // use TF32 }; diff --git a/include/onnxruntime/core/providers/cuda/cuda_resource.h b/include/onnxruntime/core/providers/cuda/cuda_resource.h index 8c3ed46ade6a..00e7dec5727d 100644 --- a/include/onnxruntime/core/providers/cuda/cuda_resource.h +++ b/include/onnxruntime/core/providers/cuda/cuda_resource.h @@ -3,11 +3,20 @@ #include "core/providers/resource.h" -#define ORT_CUDA_RESOUCE_VERSION 2 +#define ORT_CUDA_RESOUCE_VERSION 3 enum CudaResource : int { - cuda_stream_t = cuda_resource_offset, + cuda_stream_t = cuda_resource_offset, // 10000 cudnn_handle_t, cublas_handle_t, deferred_cpu_allocator_t, -}; \ No newline at end of file + // below are cuda ep options + device_id_t, // 10004 + arena_extend_strategy_t, + cudnn_conv_algo_search_t, + cudnn_conv_use_max_workspace_t, + cudnn_conv1d_pad_to_nc1d_t, + enable_skip_layer_norm_strict_mode_t, + prefer_nhwc_t, + use_tf32_t, +}; diff --git a/include/onnxruntime/core/providers/dml/dml_provider_factory.h b/include/onnxruntime/core/providers/dml/dml_provider_factory.h index 7d7f05193f48..33b98edf3bf4 100644 --- a/include/onnxruntime/core/providers/dml/dml_provider_factory.h +++ b/include/onnxruntime/core/providers/dml/dml_provider_factory.h @@ -27,14 +27,8 @@ typedef struct IDMLDevice IDMLDevice; #include "onnxruntime_c_api.h" #ifdef __cplusplus -extern "C" { -#endif -enum OrtDmlPerformancePreference { - Default = 0, - HighPerformance = 1, - MinimumPower = 2 -}; +extern "C" { enum OrtDmlDeviceFilter : uint32_t { #ifdef ENABLE_NPU_ADAPTER_ENUMERATION @@ -54,11 +48,33 @@ inline OrtDmlDeviceFilter& operator|=(OrtDmlDeviceFilter& a, OrtDmlDeviceFilter inline OrtDmlDeviceFilter& operator&=(OrtDmlDeviceFilter& a, OrtDmlDeviceFilter b) { return (OrtDmlDeviceFilter&)((int&)a &= (int)b); } inline OrtDmlDeviceFilter& operator^=(OrtDmlDeviceFilter& a, OrtDmlDeviceFilter b) { return (OrtDmlDeviceFilter&)((int&)a ^= (int)b); } +#else + +typedef enum
OrtDmlDeviceFilter { +#ifdef ENABLE_NPU_ADAPTER_ENUMERATION + Any = 0xffffffff, + Gpu = 1 << 0, + Npu = 1 << 1, +#else + Gpu = 1 << 0, +#endif +} OrtDmlDeviceFilter; + +#endif + +typedef enum OrtDmlPerformancePreference { + Default = 0, + HighPerformance = 1, + MinimumPower = 2 +} OrtDmlPerformancePreference; + struct OrtDmlDeviceOptions { OrtDmlPerformancePreference Preference; OrtDmlDeviceFilter Filter; }; +typedef struct OrtDmlDeviceOptions OrtDmlDeviceOptions; + /** * [[deprecated]] * This export is deprecated. diff --git a/include/onnxruntime/core/providers/tensorrt/tensorrt_provider_options.h b/include/onnxruntime/core/providers/tensorrt/tensorrt_provider_options.h index 680ce1cc5b9a..32a9f06464ac 100644 --- a/include/onnxruntime/core/providers/tensorrt/tensorrt_provider_options.h +++ b/include/onnxruntime/core/providers/tensorrt/tensorrt_provider_options.h @@ -11,6 +11,8 @@ /// User can only get the instance of OrtTensorRTProviderOptionsV2 via CreateTensorRTProviderOptions. ///
struct OrtTensorRTProviderOptionsV2 { + OrtTensorRTProviderOptionsV2& operator=(const OrtTensorRTProviderOptionsV2& other); // copy assignment operator + int device_id{0}; // cuda device id. int has_user_compute_stream{0}; // indicator of user specified CUDA compute stream. void* user_compute_stream{nullptr}; // user specified CUDA compute stream. @@ -46,4 +48,26 @@ struct OrtTensorRTProviderOptionsV2 { const char* trt_profile_max_shapes{nullptr}; // Specify the range of the input shapes to build the engine with const char* trt_profile_opt_shapes{nullptr}; // Specify the range of the input shapes to build the engine with int trt_cuda_graph_enable{0}; // Enable CUDA graph in ORT TRT + + /* + * Please note that there are rules for using following context model related provider options: + * + * 1. In the case of dumping the context model and loading the context model, + * for security reason, TRT EP doesn't allow the "ep_cache_context" node attribute of EP context node to be + * the absolute path or relative path that is outside of context model directory. + * It means engine cache needs to be in the same directory or sub-directory of context model. + * + * 2. In the case of dumping the context model, the engine cache path will be changed to the relative path of context model directory. + * For example: + * If "trt_dump_ep_context_model" is enabled and "trt_engine_cache_enable" is enabled, + * if "trt_ep_context_file_path" is "./context_model_dir", + * - if "trt_engine_cache_path" is "" -> the engine cache will be saved to "./context_model_dir" + * - if "trt_engine_cache_path" is "engine_dir" -> the engine cache will be saved to "./context_model_dir/engine_dir" + * + */ + int trt_dump_ep_context_model{0}; // Dump EP context node model + const char* trt_ep_context_file_path{nullptr}; // Specify file name to dump EP context node model. Can be a path or a file name or a file name with path. + int trt_ep_context_embed_mode{0}; // Specify EP context embed mode. Default 0 = context is engine cache path, 1 = context is engine binary data + + const char* trt_engine_cache_prefix{nullptr}; // specify engine cache prefix }; diff --git a/include/onnxruntime/core/session/onnxruntime_c_api.h b/include/onnxruntime/core/session/onnxruntime_c_api.h index dbd5ad41255f..e7b8f1487112 100644 --- a/include/onnxruntime/core/session/onnxruntime_c_api.h +++ b/include/onnxruntime/core/session/onnxruntime_c_api.h @@ -29,15 +29,16 @@ */ #pragma once -#include +#include #include +#include #include /** \brief The API version defined in this header * * This value is used by some API functions to behave as this version of the header expects. */ -#define ORT_API_VERSION 17 +#define ORT_API_VERSION 18 #ifdef __cplusplus extern "C" { @@ -318,6 +319,12 @@ typedef struct OrtAllocator { void*(ORT_API_CALL* Alloc)(struct OrtAllocator* this_, size_t size); ///< Returns a pointer to an allocated block of `size` bytes void(ORT_API_CALL* Free)(struct OrtAllocator* this_, void* p); ///< Free a block of memory previously allocated with OrtAllocator::Alloc const struct OrtMemoryInfo*(ORT_API_CALL* Info)(const struct OrtAllocator* this_); ///< Return a pointer to an ::OrtMemoryInfo that describes this allocator + /** + * @brief Optional allocation function to use for memory allocations made during session initialization. + * Use this function if you want to separate allocations made by ORT during Run() calls from + * those made during session initialization. This allows for separate memory management strategies for these allocations. 
+ */ + void*(ORT_API_CALL* Reserve)(struct OrtAllocator* this_, size_t size); ///< Returns a pointer to an allocated block of `size` bytes } OrtAllocator; typedef void(ORT_API_CALL* OrtLoggingFunction)( @@ -495,6 +502,7 @@ typedef struct OrtROCMProviderOptions { has_user_compute_stream{}, user_compute_stream{}, default_memory_arena_cfg{}, + enable_hip_graph{false}, tunable_op_enable{false}, tunable_op_tuning_enable{false}, tunable_op_max_tuning_duration_ms{} {} @@ -547,6 +555,8 @@ typedef struct OrtROCMProviderOptions { */ OrtArenaCfg* default_memory_arena_cfg; + int enable_hip_graph; + /** \brief Enable TunableOp for using. * Set it to 1/0 to enable/disable TunableOp. Otherwise, it is disabled by default. * This option can be overriden by environment variable ORT_ROCM_TUNABLE_OP_ENABLE. @@ -1833,14 +1843,28 @@ struct OrtApi { /** \brief Used for custom operators, get an input of a kernel * - * \see ::OrtCustomOp + * The function attempts to fetch the input of the kernel. If the input is optional + * and not present, the function returns success and out is set to nullptr. + * + * \param[in] context ::OrtKernelContext instance + * \param[in] index Input index. See KernelContext_GetInputCount for bounds checking. + * \param[out] out Set to a pointer to the OrtValue if the input is present + * + * \snippet{doc} snippets.dox OrtStatus Return Value */ ORT_API2_STATUS(KernelContext_GetInput, _In_ const OrtKernelContext* context, _In_ size_t index, _Out_ const OrtValue** out); /** \brief Used for custom operators, get an output of a kernel * - * \see ::OrtCustomOp + * The function attempts to fetch the output of the kernel. If the output is optional + * and not present, the function returns success and out is set to nullptr. + * + * \param[in] context ::OrtKernelContext instance + * \param[in] index Output index. See KernelContext_GetOutputCount for bounds checking. + * \param[out] out Set to a pointer to the OrtValue if the output is present + * + * \snippet{doc} snippets.dox OrtStatus Return Value */ ORT_API2_STATUS(KernelContext_GetOutput, _Inout_ OrtKernelContext* context, _In_ size_t index, _In_ const int64_t* dim_values, size_t dim_count, _Outptr_ OrtValue** out); @@ -3594,10 +3618,11 @@ struct OrtApi { * QNN supported keys: * "backend_path": file path to QNN backend library. * "profiling_level": QNN profiling level, options: "off", "basic", "detailed". Default to off. + * "profiling_file_path": QNN profiling file path if ETW not enabled. * "rpc_control_latency": QNN RPC control latency. * "vtcm_mb": QNN VTCM size in MB. default to 0(not set). * "htp_performance_mode": QNN performance mode, options: "burst", "balanced", "default", "high_performance", - * "high_power_saver", "low_balanced", "low_power_saver", "power_saver", "sustained_high_performance". Default to "default". + * "high_power_saver", "low_balanced", "extreme_power_saver", "low_power_saver", "power_saver", "sustained_high_performance". Default to "default". * "qnn_saver_path": File path to the QNN Saver backend library. If specified, QNN Saver will be enabled and will * dump QNN API calls to disk for replay/debugging. QNN Saver produces incorrect model inference results and * may alter model/EP partitioning. Use only for debugging. @@ -3607,6 +3632,18 @@ struct OrtApi { * - "1": Faster preparation time, less optimal graph. * - "2": Longer preparation time, more optimal graph. * - "3": Longest preparation time, most likely even more optimal graph. See QNN SDK documentation for specific details. + * "soc_model": The SoC model number.
Refer to the QNN SDK documentation for valid values. Defaults to "0" (unknown). + * "htp_arch": The minimum HTP architecture the driver will use to select compatible QNN operators. Available options: + * - "0": Default (none). + * - "68" + * - "69" + * - "73" + * - "75" + * "device_id": The ID of the device to use when setting 'htp_arch'. Defaults to "0" (for single device). + "enable_htp_fp16_precision": Only used for float32 model. + Enable the float32 model to be inferenced with fp16 precision. Otherwise, it will be fp32 precision. + - "0": Default. With fp32 precision. + - "1": With fp16 precision. * * SNPE supported keys: * "runtime": SNPE runtime engine, options: "CPU", "CPU_FLOAT32", "GPU", "GPU_FLOAT32_16_HYBRID", "GPU_FLOAT16", @@ -4417,7 +4454,7 @@ struct OrtApi { ORT_API2_STATUS(GetCUDAProviderOptionsByName, _In_ const OrtCUDAProviderOptionsV2* cuda_options, _In_ const char* key, _Outptr_ void** ptr); /** - * Get a EP resoure. + * Get a EP resource. * E.g. a cuda stream or a cublas handle * * \param context - Kernel context @@ -4515,6 +4552,85 @@ struct OrtApi { * \since Version 1.17. */ ORT_API2_STATUS(ReadOpAttr, _In_ const OrtOpAttr* op_attr, _In_ OrtOpAttrType type, _Inout_ void* data, _In_ size_t len, _Out_ size_t* out); + + /** \brief Set whether to use deterministic compute. + * + * Default is false. If set to true, this will enable deterministic compute for GPU kernels where possible. + * Note that this most likely will have a performance cost. + * + * \param[in] options + * \param[in] value + * + * \since Version 1.17. + */ + ORT_API2_STATUS(SetDeterministicCompute, _Inout_ OrtSessionOptions* options, bool value); + + /** + * Run fn in parallel + * + * \param[in] context + * \param[in] fn Function accepting usr_data and an integer as iterator + * \param[in] total The number of times fn is to be invoked + * \param[in] num_batch Number of batches by which the "total" is to be divided in maximum. When zero, there is no limit + * \param[in] usr_data User data to be passed back to fn + * + * \since Version 1.17. + */ + ORT_API2_STATUS(KernelContext_ParallelFor, _In_ const OrtKernelContext* context, _In_ void (*fn)(void*, size_t), _In_ size_t total, _In_ size_t num_batch, _In_ void* usr_data); + + /** \brief Append OpenVINO execution provider to the session options + * + * If OpenVINO is not available (due to a non OpenVINO enabled build, or if OpenVINO is not installed on the system), this function will fail. + * + * \param[in] options + * \param[in] provider_options_keys + * \param[in] provider_options_values + * \param[in] num_keys + * + * \snippet{doc} snippets.dox OrtStatus Return Value + */ + ORT_API2_STATUS(SessionOptionsAppendExecutionProvider_OpenVINO_V2, + _In_ OrtSessionOptions* options, + _In_reads_(num_keys) const char* const* provider_options_keys, + _In_reads_(num_keys) const char* const* provider_options_values, + _In_ size_t num_keys); + + /** \brief Append VitisAI provider to session options + * + * If VitisAI is not available (due to a non VitisAI enabled build, or if VitisAI is not installed on the system), this function will return failure. 
+ * + * \param[in] options + * \param[in] provider_options_keys + * \param[in] provider_options_values + * \param[in] num_keys + * + * \snippet{doc} snippets.dox OrtStatus Return Value + */ + ORT_API2_STATUS(SessionOptionsAppendExecutionProvider_VitisAI, + _In_ OrtSessionOptions* options, + _In_reads_(num_keys) const char* const* provider_options_keys, + _In_reads_(num_keys) const char* const* provider_options_values, + _In_ size_t num_keys); + + /** \brief Get scratch buffer from the corresponding allocator under the specific OrtMemoryInfo object. + * NOTE: callers are responsible for releasing this scratch buffer back to the corresponding allocator + * \param[in] context OrtKernelContext instance + * \param[in] mem_info OrtMemoryInfo instance + * \param[in] count_or_bytes The size in bytes of this scratch buffer + * \param[out] out A pointer to the scratch buffer + * \snippet{doc} snippets.dox OrtStatus Return Value + */ + ORT_API2_STATUS(KernelContext_GetScratchBuffer, _In_ const OrtKernelContext* context, _In_ const OrtMemoryInfo* mem_info, _In_ size_t count_or_bytes, _Outptr_ void** out); + + /** \brief Get allocator from KernelInfo for a specific memory type. Please use C API ReleaseAllocator to release out object + * + * \param[in] info OrtKernelInfo instance + * \param[in] mem_type OrtMemType object + * \param[out] out A pointer to OrtAllocator + * + * \snippet{doc} snippets.dox OrtStatus Return Value + */ + ORT_API2_STATUS(KernelInfoGetAllocator, _In_ const OrtKernelInfo* info, _In_ OrtMemType mem_type, _Outptr_ OrtAllocator** out); }; /* @@ -4612,6 +4728,21 @@ struct OrtCustomOp { // Get start range int(ORT_API_CALL* GetStartVersion)(_In_ const struct OrtCustomOp* op); int(ORT_API_CALL* GetEndVersion)(_In_ const struct OrtCustomOp* op); + + // Get the inplace_map that defines which output can reuse which input + // Callers will provide 2 raw int* and pass in their address, this function will fill these 2 arrays; + // on return, output (*output_index)[i] may reuse the input (*input_index)[i]. + // The return value is the size of these 2 arrays. + // Callers are responsible for deleting these 2 arrays after use by calling OrtCustomOp::ReleaseMayInplace(). + size_t(ORT_API_CALL* GetMayInplace)(_Out_ int** input_index, _Out_ int** output_index); + + // Release the pointer input_index and output_index allocated from GetMayInplace() function. + // If GetMayInplace() is defined, this function MUST be defined as well. + void(ORT_API_CALL* ReleaseMayInplace)(_Frees_ptr_opt_ int* input_index, _Frees_ptr_opt_ int* output_index); + + // Same as GetMayInplace() and ReleaseMayInplace() + size_t(ORT_API_CALL* GetAliasMap)(_Out_ int** input_index, _Out_ int** output_index); + void(ORT_API_CALL* ReleaseAliasMap)(_Frees_ptr_opt_ int* input_index, _Frees_ptr_opt_ int* output_index); };
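Because ORT releases these arrays by calling back into the op, GetMayInplace() and ReleaseMayInplace() must allocate and free with matching strategies. A hedged sketch for a custom op whose output 0 may reuse input 0 follows; the function names are invented, but the signatures are the ones declared above.

```cpp
// Illustrative only: implementing the new inplace-map callbacks for a custom
// op where output 0 may reuse input 0. ORT calls ReleaseMayInplace() on exactly
// the arrays GetMayInplace() handed out, so new[]/delete[] must pair up.
#include <onnxruntime_c_api.h>

static size_t ORT_API_CALL MyOpGetMayInplace(int** input_index, int** output_index) {
  *input_index = new int[1]{0};   // input 0 ...
  *output_index = new int[1]{0};  // ... may be reused by output 0
  return 1;                       // number of entries in each array
}

static void ORT_API_CALL MyOpReleaseMayInplace(int* input_index, int* output_index) {
  delete[] input_index;
  delete[] output_index;
}

// Wiring, e.g. in the op's constructor:
//   OrtCustomOp::GetMayInplace = MyOpGetMayInplace;
//   OrtCustomOp::ReleaseMayInplace = MyOpReleaseMayInplace;
```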
/* diff --git a/include/onnxruntime/core/session/onnxruntime_cxx_api.h b/include/onnxruntime/core/session/onnxruntime_cxx_api.h index 92c25d8688b6..fd0e3490426a 100644 --- a/include/onnxruntime/core/session/onnxruntime_cxx_api.h +++ b/include/onnxruntime/core/session/onnxruntime_cxx_api.h @@ -845,6 +845,7 @@ struct SessionOptionsImpl : ConstSessionOptionsImpl { SessionOptionsImpl& SetIntraOpNumThreads(int intra_op_num_threads); ///< Wraps OrtApi::SetIntraOpNumThreads SessionOptionsImpl& SetInterOpNumThreads(int inter_op_num_threads); ///< Wraps OrtApi::SetInterOpNumThreads SessionOptionsImpl& SetGraphOptimizationLevel(GraphOptimizationLevel graph_optimization_level); ///< Wraps OrtApi::SetSessionGraphOptimizationLevel + SessionOptionsImpl& SetDeterministicCompute(bool value); ///< Wraps OrtApi::SetDeterministicCompute SessionOptionsImpl& EnableCpuMemArena(); ///< Wraps OrtApi::EnableCpuMemArena SessionOptionsImpl& DisableCpuMemArena(); ///< Wraps OrtApi::DisableCpuMemArena @@ -873,10 +874,12 @@ struct SessionOptionsImpl : ConstSessionOptionsImpl { SessionOptionsImpl& AddInitializer(const char* name, const OrtValue* ort_val); ///< Wraps OrtApi::AddInitializer SessionOptionsImpl& AddExternalInitializers(const std::vector& names, const std::vector& ort_values); ///< Wraps OrtApi::AddExternalInitializers - SessionOptionsImpl& AppendExecutionProvider_CUDA(const OrtCUDAProviderOptions& provider_options); ///< Wraps OrtApi::SessionOptionsAppendExecutionProvider_CUDA - SessionOptionsImpl& AppendExecutionProvider_CUDA_V2(const OrtCUDAProviderOptionsV2& provider_options); ///< Wraps OrtApi::SessionOptionsAppendExecutionProvider_CUDA_V2 - SessionOptionsImpl& AppendExecutionProvider_ROCM(const OrtROCMProviderOptions& provider_options); ///< Wraps OrtApi::SessionOptionsAppendExecutionProvider_ROCM - SessionOptionsImpl& AppendExecutionProvider_OpenVINO(const OrtOpenVINOProviderOptions& provider_options); ///< Wraps OrtApi::SessionOptionsAppendExecutionProvider_OpenVINO + SessionOptionsImpl& AppendExecutionProvider_CUDA(const OrtCUDAProviderOptions& provider_options); ///< Wraps OrtApi::SessionOptionsAppendExecutionProvider_CUDA + SessionOptionsImpl& AppendExecutionProvider_CUDA_V2(const OrtCUDAProviderOptionsV2& provider_options); ///< Wraps OrtApi::SessionOptionsAppendExecutionProvider_CUDA_V2 + SessionOptionsImpl& AppendExecutionProvider_ROCM(const OrtROCMProviderOptions& provider_options); ///< Wraps OrtApi::SessionOptionsAppendExecutionProvider_ROCM + SessionOptionsImpl& AppendExecutionProvider_OpenVINO(const OrtOpenVINOProviderOptions& provider_options); ///< Wraps OrtApi::SessionOptionsAppendExecutionProvider_OpenVINO + ///< Wraps OrtApi::SessionOptionsAppendExecutionProvider_OpenVINO_V2 + SessionOptionsImpl& AppendExecutionProvider_OpenVINO_V2(const std::unordered_map& provider_options = {}); SessionOptionsImpl& AppendExecutionProvider_TensorRT(const OrtTensorRTProviderOptions& provider_options); ///< Wraps OrtApi::SessionOptionsAppendExecutionProvider_TensorRT SessionOptionsImpl& AppendExecutionProvider_TensorRT_V2(const OrtTensorRTProviderOptionsV2& provider_options); ///< Wraps OrtApi::SessionOptionsAppendExecutionProvider_TensorRT
SessionOptionsImpl& AppendExecutionProvider_MIGraphX(const OrtMIGraphXProviderOptions& provider_options); ///< Wraps OrtApi::SessionOptionsAppendExecutionProvider_MIGraphX @@ -898,6 +901,9 @@ struct SessionOptionsImpl : ConstSessionOptionsImpl { SessionOptionsImpl& RegisterCustomOpsLibrary(const ORTCHAR_T* library_name, const CustomOpConfigs& custom_op_configs = {}); SessionOptionsImpl& RegisterCustomOpsUsingFunction(const char* function_name); ///< Wraps OrtApi::RegisterCustomOpsUsingFunction + + ///< Wraps OrtApi::SessionOptionsAppendExecutionProvider_VitisAI + SessionOptionsImpl& AppendExecutionProvider_VitisAI(const std::unordered_map& provider_options = {}); }; } // namespace detail @@ -2049,13 +2055,18 @@ struct KernelContext { explicit KernelContext(OrtKernelContext* context); size_t GetInputCount() const; size_t GetOutputCount() const; + // If the input is optional and not present, the method returns an empty ConstValue + // which can be compared to nullptr. ConstValue GetInput(size_t index) const; + // If the output is optional and not present, the method returns an empty UnownedValue + // which can be compared to nullptr. UnownedValue GetOutput(size_t index, const int64_t* dim_values, size_t dim_count) const; UnownedValue GetOutput(size_t index, const std::vector& dims) const; void* GetGPUComputeStream() const; Logger GetLogger() const; OrtAllocator* GetAllocator(const OrtMemoryInfo& memory_info) const; OrtKernelContext* GetOrtKernelContext() const { return ctx_; } + void ParallelFor(void (*fn)(void*, size_t), size_t total, size_t num_batch, void* usr_data) const; private: OrtKernelContext* ctx_; @@ -2290,6 +2301,11 @@ struct CustomOpBase : OrtCustomOp { OrtCustomOp::GetEndVersion = [](const OrtCustomOp* this_) { return static_cast(this_)->end_ver_; }; + + OrtCustomOp::GetMayInplace = nullptr; + OrtCustomOp::ReleaseMayInplace = nullptr; + OrtCustomOp::GetAliasMap = nullptr; + OrtCustomOp::ReleaseAliasMap = nullptr; } // Default implementation of GetExecutionProviderType that returns nullptr to default to the CPU provider
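Note that the new KernelContext::ParallelFor takes a plain function pointer, so per-call state has to travel through usr_data; a capture-free lambda converts implicitly. A hedged sketch follows (the work struct and buffer are invented for illustration).

```cpp
// Illustrative only: using the new KernelContext::ParallelFor from a custom
// op's Compute. State is threaded through usr_data because fn must be a plain
// function pointer.
#include <cstddef>
#include <onnxruntime_cxx_api.h>

struct SquareWork {
  float* data;  // assumed output buffer of length `total`
};

void SquareInParallel(Ort::KernelContext& ctx, float* data, size_t total) {
  SquareWork work{data};
  ctx.ParallelFor(
      [](void* usr_data, size_t i) {      // capture-free lambda -> function pointer
        auto* w = static_cast<SquareWork*>(usr_data);
        w->data[i] *= w->data[i];
      },
      total,
      /*num_batch=*/0,  // 0 = no cap on how many batches `total` may be split into
      &work);
}
```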
diff --git a/include/onnxruntime/core/session/onnxruntime_cxx_inline.h b/include/onnxruntime/core/session/onnxruntime_cxx_inline.h index 860a27fc73f7..9d1e8c944308 100644 --- a/include/onnxruntime/core/session/onnxruntime_cxx_inline.h +++ b/include/onnxruntime/core/session/onnxruntime_cxx_inline.h @@ -7,17 +7,27 @@ // These are the inline implementations of the C++ header APIs. They're in this separate file as to not clutter // the main C++ file with implementation details. -#include +#include - -#define RETURN_ON_API_FAIL(expression) \ - { \ - auto err = (expression); \ - if (err) { \ - return Status(err); \ - } \ +#include +#include + +// Convert OrtStatus to Ort::Status and return +// instead of throwing +#define ORT_CXX_RETURN_ON_API_FAIL(expression) \ + { \ + auto ort_status = (expression); \ + if (ort_status) { \ + return Ort::Status(ort_status); \ + } \ } +#ifdef __cpp_if_constexpr +#define ORT_CXX_IF_CONSTEXPR if constexpr +#else +#define ORT_CXX_IF_CONSTEXPR if +#endif + namespace Ort { namespace detail { @@ -656,6 +666,12 @@ inline SessionOptionsImpl& SessionOptionsImpl::SetGraphOptimizationLevel(G return *this; } +template +inline SessionOptionsImpl& SessionOptionsImpl::SetDeterministicCompute(bool value) { + ThrowOnError(GetApi().SetDeterministicCompute(this->p_, value)); + return *this; +} + template inline SessionOptionsImpl& SessionOptionsImpl::SetOptimizedModelFilePath(const ORTCHAR_T* optimized_model_filepath) { ThrowOnError(GetApi().SetOptimizedModelFilePath(this->p_, optimized_model_filepath)); @@ -859,6 +875,45 @@ inline SessionOptionsImpl& SessionOptionsImpl::AppendExecutionProvider_Ope return *this; } +template +inline SessionOptionsImpl& SessionOptionsImpl::AppendExecutionProvider_OpenVINO_V2(const std::unordered_map& provider_options) { + auto num_entries = provider_options.size(); + std::vector keys, values; + if (num_entries > 0) { + keys.reserve(num_entries); + values.reserve(num_entries); + + for (const auto& entry : provider_options) { + keys.push_back(entry.first.c_str()); + values.push_back(entry.second.c_str()); + } + } + + ThrowOnError(GetApi().SessionOptionsAppendExecutionProvider_OpenVINO_V2(this->p_, + keys.data(), values.data(), num_entries)); + + return *this; +} + +template +inline SessionOptionsImpl& SessionOptionsImpl::AppendExecutionProvider_VitisAI(const std::unordered_map& provider_options) { + auto num_entries = provider_options.size(); + std::vector keys, values; + if (num_entries > 0) { + keys.reserve(num_entries); + values.reserve(num_entries); + + for (const auto& entry : provider_options) { + keys.push_back(entry.first.c_str()); + values.push_back(entry.second.c_str()); + } + } + + ThrowOnError(GetApi().SessionOptionsAppendExecutionProvider_VitisAI(this->p_, keys.data(), values.data(), num_entries)); + + return *this; +} + template inline SessionOptionsImpl& SessionOptionsImpl::RegisterCustomOpsLibrary(const ORTCHAR_T* library_name, const CustomOpConfigs& custom_op_configs) { @@ -1652,6 +1707,10 @@ inline Logger KernelContext::GetLogger() const { return Logger{out}; } +inline void KernelContext::ParallelFor(void (*fn)(void*, size_t), size_t total, size_t num_batch, void* usr_data) const { + ThrowOnError(GetApi().KernelContext_ParallelFor(ctx_, fn, total, num_batch, usr_data)); +} + inline OpAttr::OpAttr(const char* name, const void* data, int len, OrtOpAttrType type) { Ort::ThrowOnError(GetApi().CreateOpAttr(name, data, len, type, &p_)); } @@ -1918,7 +1977,7 @@ inline ShapeInferContext::ShapeInferContext(const OrtApi* ort_api, inline Status ShapeInferContext::SetOutputShape(size_t indice, const Shape& shape) { OrtTensorTypeAndShapeInfo* info = {}; - RETURN_ON_API_FAIL(ort_api_->CreateTensorTypeAndShapeInfo(&info)); + ORT_CXX_RETURN_ON_API_FAIL(ort_api_->CreateTensorTypeAndShapeInfo(&info)); using InfoPtr = std::unique_ptr>; @@ -1942,9 +2001,9 @@ inline Status ShapeInferContext::SetOutputShape(size_t indice, const Shape& shap } } - RETURN_ON_API_FAIL(ort_api_->SetDimensions(info, integer_dims.data(),
integer_dims.size())); - RETURN_ON_API_FAIL(ort_api_->SetSymbolicDimensions(info, symbolic_dims.data(), symbolic_dims.size())); - RETURN_ON_API_FAIL(ort_api_->ShapeInferContext_SetOutputTypeShape(ctx_, indice, info)); + ORT_CXX_RETURN_ON_API_FAIL(ort_api_->SetDimensions(info, integer_dims.data(), integer_dims.size())); + ORT_CXX_RETURN_ON_API_FAIL(ort_api_->SetSymbolicDimensions(info, symbolic_dims.data(), symbolic_dims.size())); + ORT_CXX_RETURN_ON_API_FAIL(ort_api_->ShapeInferContext_SetOutputTypeShape(ctx_, indice, info)); return Status{nullptr}; } diff --git a/include/onnxruntime/core/session/onnxruntime_lite_custom_op.h b/include/onnxruntime/core/session/onnxruntime_lite_custom_op.h index 0c0af16d4e20..ee60f25da115 100644 --- a/include/onnxruntime/core/session/onnxruntime_lite_custom_op.h +++ b/include/onnxruntime/core/session/onnxruntime_lite_custom_op.h @@ -862,6 +862,11 @@ struct OrtLiteCustomOp : public OrtCustomOp { auto self = reinterpret_cast(op); return self->end_ver_; }; + + OrtCustomOp::GetMayInplace = {}; + OrtCustomOp::ReleaseMayInplace = {}; + OrtCustomOp::GetAliasMap = {}; + OrtCustomOp::ReleaseAliasMap = {}; } const std::string op_name_; @@ -1111,4 +1116,4 @@ OrtLiteCustomOp* CreateLiteCustomOp(const char* op_name, } } // namespace Custom -} // namespace Ort \ No newline at end of file +} // namespace Ort diff --git a/include/onnxruntime/core/session/onnxruntime_run_options_config_keys.h b/include/onnxruntime/core/session/onnxruntime_run_options_config_keys.h index 1f5fcd50e185..c80b8c0c164b 100644 --- a/include/onnxruntime/core/session/onnxruntime_run_options_config_keys.h +++ b/include/onnxruntime/core/session/onnxruntime_run_options_config_keys.h @@ -30,3 +30,22 @@ static const char* const kOrtRunOptionsConfigEnableMemoryArenaShrinkage = "memor // Per default it will be set to '0' // Taking CUDA EP as an example, it omit triggering cudaStreamSynchronize on the compute stream. static const char* const kOrtRunOptionsConfigDisableSynchronizeExecutionProviders = "disable_synchronize_execution_providers"; + +// Set HTP performance mode for QNN HTP backend before session run. +// options for HTP performance mode: "burst", "balanced", "default", "high_performance", +// "high_power_saver", "low_balanced", "extreme_power_saver", "low_power_saver", "power_saver", +// "sustained_high_performance". Defaults to "default". +static const char* const kOrtRunOptionsConfigQnnPerfMode = "qnn.htp_perf_mode"; + +// Set HTP performance mode for QNN HTP backend post session run. +static const char* const kOrtRunOptionsConfigQnnPerfModePostRun = "qnn.htp_perf_mode_post_run"; + +// Set RPC control latency for QNN HTP backend +static const char* const kOrtRunOptionsConfigQnnRpcControlLatency = "qnn.rpc_control_latency"; + +// Set graph annotation id for CUDA EP. Use with enable_cuda_graph=true. +// The value should be an integer. If the value is not set, the default value is 0 and +// the ORT session only captures one cuda graph before another capture is requested. +// If the value is set to -1, cuda graph capture/replay is disabled in that run. +// Users are not expected to set the value to 0 as it is reserved for internal use. +static const char* const kOrtRunOptionsConfigCudaGraphAnnotation = "gpu_graph_id";
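Putting the key to use: the sketch below shows how a caller might annotate individual Run() calls. It is illustrative only and assumes a session whose CUDA EP was created with cuda-graph capture enabled; tensor setup is elided.

```cpp
// Illustrative only: annotating individual Run() calls with a cuda-graph id.
#include <onnxruntime_cxx_api.h>
#include <onnxruntime_run_options_config_keys.h>

void RunAnnotated(Ort::Session& session,
                  const char* const* input_names, const Ort::Value* inputs, size_t input_count,
                  const char* const* output_names, Ort::Value* outputs, size_t output_count) {
  Ort::RunOptions run_options;
  // The first Run with id "1" captures a graph; later Runs with "1" replay it.
  run_options.AddConfigEntry(kOrtRunOptionsConfigCudaGraphAnnotation, "1");
  // A value of "-1" would skip capture/replay entirely for this Run:
  // run_options.AddConfigEntry(kOrtRunOptionsConfigCudaGraphAnnotation, "-1");
  session.Run(run_options, input_names, inputs, input_count, output_names, outputs, output_count);
}
```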
diff --git a/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h b/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h index df79cb6e5b21..bb5e0344895e 100644 --- a/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h +++ b/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h @@ -93,6 +93,15 @@ static const char* const kOrtSessionOptionsMemoryOptimizerEnabler = "optimizatio static const char* const kOrtSessionOptionsMemoryOptimizerProbeConfig = "optimization.enable_memory_probe_recompute_config"; #endif +// If set, this setting should contain a comma-separated list of optimizer names that should be disabled. +// Optimizers may take time to execute and affect model loading time. If you feel that a specific optimizer +// does not provide runtime benefits but affects your model loading time, you may disable it using this config +// entry. This option is not enabled in ORT_MINIMAL_BUILD builds. +// A list of optimizers is available in onnxruntime/core/optimizer/graph_transformer_utils.cc +// +// Default is an empty string which means no optimizers are disabled. +static const char* const kOrtSessionOptionsDisableSpecifiedOptimizers = "optimization.disable_specified_optimizers"; + // Enable or disable using device allocator for allocating initialized tensor memory. "1": enable; "0": disable. The default is "0". // Using device allocators means the memory allocation is made using malloc/new. static const char* const kOrtSessionOptionsUseDeviceAllocatorForInitializers = "session.use_device_allocator_for_initializers"; @@ -236,7 +245,7 @@ static const char* const kOrtSessionOptionsOptimizedModelExternalInitializersFil static const char* const kOrtSessionOptionsOptimizedModelExternalInitializersMinSizeInBytes = "session.optimized_model_external_initializers_min_size_in_bytes"; -// Enable EP context feature to dump the partitioned graph which include the EP context into Onnx file. +// Enable EP context feature to dump the partitioned graph which includes the EP context into Onnx file. // The dumped Onnx model with EP context can be used for future inference to avoid the EP graph partitioning/compile overhead. // "0": disable. (default) // "1": enable. @@ -249,4 +258,10 @@ static const char* const kOrtSessionOptionEpContextFilePath = "ep.context_file_p // Flag to specify whether to dump the EP context into the Onnx model. // "0": dump the EP context into separate file, keep the file name in the Onnx model. // "1": dump the EP context into the Onnx model. (default). -static const char* const kOrtSessionOptionEpContextEmbedMode = "ep.context_embed_mode"; \ No newline at end of file +static const char* const kOrtSessionOptionEpContextEmbedMode = "ep.context_embed_mode"; + +// Gemm fastmath mode provides fp32 gemm acceleration with bfloat16 based matmul. +// Option values: +// - "0": Gemm FastMath mode is not enabled. [DEFAULT] +// - "1": Gemm FastMath mode is enabled. +static const char* const kOrtSessionOptionsMlasGemmFastMathArm64Bfloat16 = "mlas.enable_gemm_fastmath_arm64_bfloat16"; diff --git a/java/README.md b/java/README.md index 2ce9a8bf62e4..5c5baeb43a27 100644 --- a/java/README.md +++ b/java/README.md @@ -14,7 +14,7 @@ Use the main project's [build instructions](https://www.onnxruntime.ai/docs/how- #### Requirements -JDK version 8 or later is required. +Java 11 or later is required to build the library.
The compiled jar file will run on Java 8 or later. The [Gradle](https://gradle.org/) build system is used here to manage the Java project's dependency management, compilation, testing, and assembly. In particular, the Gradle [wrapper](https://docs.gradle.org/current/userguide/gradle_wrapper.html) at `java/gradlew[.bat]` is used, locking the Gradle version to the one specified in the `java/gradle/wrapper/gradle-wrapper.properties` configuration. @@ -35,6 +35,7 @@ This allows the CMake system to ensure all of the C/C++ compilation is achieved The Java build depends on C/C++ onnxruntime shared library and a C JNI shared library (source located in the `src/main/native` directory). The JNI shared library is the glue that allows for Java to call functions in onnxruntime shared library. Given the fact that CMake injects native dependencies during CMake builds, some gradle tasks (primarily, `build`, `test`, and `check`) may fail. +To run the Java build independently of CMake supply `-DcmakeBuildDir=`, though this will only succeed after an initial build of the native libraries has completed. When running the build script, CMake will compile the `onnxruntime` target and the JNI glue `onnxruntime4j_jni` target and expose the resulting libraries in a place where Gradle can ingest them. Upon successful compilation of those targets, a special Gradle task to build will be executed. The results will be placed in the output directory stated above. @@ -61,4 +62,4 @@ Then the corresponding C files in `./src/main/native/ai_onnxruntime*.c` may be u ### Dependencies -The Java API does not have any runtime or compile dependencies currently. +The Java API does not have any runtime or compile dependencies. diff --git a/java/build.gradle b/java/build.gradle index c0a75f8165f7..fd66ec220b78 100644 --- a/java/build.gradle +++ b/java/build.gradle @@ -3,7 +3,7 @@ plugins { id 'maven-publish' id 'signing' id 'jacoco' - id "com.diffplug.spotless" version "6.13.0" + id "com.diffplug.spotless" version "6.25.0" } allprojects { @@ -185,7 +185,7 @@ test { if (cmakeBuildDir != null) { workingDir cmakeBuildDir } - systemProperties System.getProperties().subMap(['USE_CUDA', 'USE_ROCM', 'USE_TENSORRT', 'USE_DNNL', 'USE_OPENVINO', 'USE_COREML', 'JAVA_FULL_TEST', 'ENABLE_TRAINING_APIS']) + systemProperties System.getProperties().subMap(['USE_CUDA', 'USE_ROCM', 'USE_TENSORRT', 'USE_DNNL', 'USE_OPENVINO', 'USE_COREML', 'USE_DML', 'JAVA_FULL_TEST', 'ENABLE_TRAINING_APIS']) testLogging { events "passed", "skipped", "failed" showStandardStreams = true diff --git a/java/gradle/wrapper/gradle-wrapper.jar b/java/gradle/wrapper/gradle-wrapper.jar index ccebba7710de..d64cd4917707 100644 Binary files a/java/gradle/wrapper/gradle-wrapper.jar and b/java/gradle/wrapper/gradle-wrapper.jar differ diff --git a/java/gradle/wrapper/gradle-wrapper.properties b/java/gradle/wrapper/gradle-wrapper.properties index f396aaac2d31..4baf5a11d45a 100644 --- a/java/gradle/wrapper/gradle-wrapper.properties +++ b/java/gradle/wrapper/gradle-wrapper.properties @@ -1,7 +1,8 @@ distributionBase=GRADLE_USER_HOME distributionPath=wrapper/dists -distributionSha256Sum=1b6b558be93f29438d3df94b7dfee02e794b94d9aca4611a92cdb79b6b88e909 -distributionUrl=https\://services.gradle.org/distributions/gradle-8.0.1-bin.zip +distributionSha256Sum=9631d53cf3e74bfa726893aee1f8994fee4e060c401335946dba2156f440f24c +distributionUrl=https\://services.gradle.org/distributions/gradle-8.6-bin.zip networkTimeout=10000 +validateDistributionUrl=true zipStoreBase=GRADLE_USER_HOME 
zipStorePath=wrapper/dists diff --git a/java/gradlew b/java/gradlew index 79a61d421cc4..1aa94a426907 100755 --- a/java/gradlew +++ b/java/gradlew @@ -83,10 +83,8 @@ done # This is normally unused # shellcheck disable=SC2034 APP_BASE_NAME=${0##*/} -APP_HOME=$( cd "${APP_HOME:-./}" && pwd -P ) || exit - -# Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. -DEFAULT_JVM_OPTS='"-Xmx64m" "-Xms64m"' +# Discard cd standard output in case $CDPATH is set (https://github.com/gradle/gradle/issues/25036) +APP_HOME=$( cd "${APP_HOME:-./}" > /dev/null && pwd -P ) || exit # Use the maximum available, or set MAX_FD != -1 to use that value. MAX_FD=maximum @@ -133,10 +131,13 @@ location of your Java installation." fi else JAVACMD=java - which java >/dev/null 2>&1 || die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. + if ! command -v java >/dev/null 2>&1 + then + die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. Please set the JAVA_HOME variable in your environment to match the location of your Java installation." + fi fi # Increase the maximum file descriptors if we can. @@ -144,7 +145,7 @@ if ! "$cygwin" && ! "$darwin" && ! "$nonstop" ; then case $MAX_FD in #( max*) # In POSIX sh, ulimit -H is undefined. That's why the result is checked to see if it worked. - # shellcheck disable=SC3045 + # shellcheck disable=SC2039,SC3045 MAX_FD=$( ulimit -H -n ) || warn "Could not query maximum file descriptor limit" esac @@ -152,7 +153,7 @@ if ! "$cygwin" && ! "$darwin" && ! "$nonstop" ; then '' | soft) :;; #( *) # In POSIX sh, ulimit -n is undefined. That's why the result is checked to see if it worked. - # shellcheck disable=SC3045 + # shellcheck disable=SC2039,SC3045 ulimit -n "$MAX_FD" || warn "Could not set maximum file descriptor limit to $MAX_FD" esac @@ -197,11 +198,15 @@ if "$cygwin" || "$msys" ; then done fi -# Collect all arguments for the java command; -# * $DEFAULT_JVM_OPTS, $JAVA_OPTS, and $GRADLE_OPTS can contain fragments of -# shell script including quotes and variable substitutions, so put them in -# double quotes to make sure that they get re-expanded; and -# * put everything else in single quotes, so that it's not re-expanded. + +# Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. +DEFAULT_JVM_OPTS='"-Xmx64m" "-Xms64m"' + +# Collect all arguments for the java command: +# * DEFAULT_JVM_OPTS, JAVA_OPTS, JAVA_OPTS, and optsEnvironmentVar are not allowed to contain shell fragments, +# and any embedded shellness will be escaped. +# * For example: A user cannot expect ${Hostname} to be expanded, as it is an environment variable and will be +# treated as '${Hostname}' itself on the command line. set -- \ "-Dorg.gradle.appname=$APP_BASE_NAME" \ diff --git a/java/src/main/java/ai/onnxruntime/OnnxJavaType.java b/java/src/main/java/ai/onnxruntime/OnnxJavaType.java index 24bf6ad4b95f..6f3ca13984f4 100644 --- a/java/src/main/java/ai/onnxruntime/OnnxJavaType.java +++ b/java/src/main/java/ai/onnxruntime/OnnxJavaType.java @@ -45,8 +45,10 @@ public enum OnnxJavaType { /** The native value of the enum. */ public final int value; + /** The Java side type used as the carrier. */ public final Class clazz; + /** The number of bytes used by a single value of this type. 
*/ public final int size; diff --git a/java/src/main/java/ai/onnxruntime/OnnxMap.java b/java/src/main/java/ai/onnxruntime/OnnxMap.java index 354ebec61274..68d91d0d9e74 100644 --- a/java/src/main/java/ai/onnxruntime/OnnxMap.java +++ b/java/src/main/java/ai/onnxruntime/OnnxMap.java @@ -8,6 +8,7 @@ import java.util.Arrays; import java.util.HashMap; import java.util.Map; +import java.util.logging.Logger; /** * A container for a map returned by {@link OrtSession#run(Map)}. @@ -16,6 +17,7 @@ * values: String, Long, Float, Double. */ public class OnnxMap implements OnnxValue { + private static final Logger logger = Logger.getLogger(OnnxMap.class.getName()); static { try { @@ -37,6 +39,7 @@ public enum OnnxMapValueType { FLOAT(3), /** A 64-bit floating point value. */ DOUBLE(4); + /** The native enum value. */ final int value; @@ -107,6 +110,8 @@ public static OnnxMapValueType mapFromOnnxJavaType(OnnxJavaType type) { private final OnnxMapValueType valueType; + private boolean closed; + /** * Constructs an OnnxMap containing a reference to the native map along with the type information. * @@ -122,6 +127,7 @@ public static OnnxMapValueType mapFromOnnxJavaType(OnnxJavaType type) { this.info = info; this.stringKeys = info.keyType == OnnxJavaType.STRING; this.valueType = OnnxMapValueType.mapFromOnnxJavaType(info.valueType); + this.closed = false; } /** @@ -146,6 +152,7 @@ public OnnxValueType getType() { */ @Override public Map getValue() throws OrtException { + checkClosed(); Object[] keys = getMapKeys(); Object[] values = getMapValues(); HashMap map = new HashMap<>(OrtUtil.capacityFromSize(keys.length)); @@ -222,10 +229,27 @@ public String toString() { return "ONNXMap(size=" + size() + ",info=" + info.toString() + ")"; } + @Override + public synchronized boolean isClosed() { + return closed; + } + /** Closes this map, releasing the native memory backing it and it's elements. */ @Override - public void close() { - close(OnnxRuntime.ortApiHandle, nativeHandle); + public synchronized void close() { + if (!closed) { + close(OnnxRuntime.ortApiHandle, nativeHandle); + closed = true; + } else { + logger.warning("Closing an already closed map."); + } + } + + /** Checks if the OnnxValue is closed, if so throws {@link IllegalStateException}. 
*/ + protected void checkClosed() { + if (closed) { + throw new IllegalStateException("Trying to use a closed OnnxValue"); + } } private native String[] getStringKeys(long apiHandle, long nativeHandle, long allocatorHandle) diff --git a/java/src/main/java/ai/onnxruntime/OnnxRuntime.java b/java/src/main/java/ai/onnxruntime/OnnxRuntime.java index ed739dd9729d..f552badd4f83 100644 --- a/java/src/main/java/ai/onnxruntime/OnnxRuntime.java +++ b/java/src/main/java/ai/onnxruntime/OnnxRuntime.java @@ -54,19 +54,25 @@ final class OnnxRuntime { /** The short name of the ONNX runtime shared library */ static final String ONNXRUNTIME_LIBRARY_NAME = "onnxruntime"; + /** The short name of the ONNX runtime JNI shared library */ static final String ONNXRUNTIME_JNI_LIBRARY_NAME = "onnxruntime4j_jni"; /** The short name of the ONNX runtime shared provider library */ static final String ONNXRUNTIME_LIBRARY_SHARED_NAME = "onnxruntime_providers_shared"; + /** The short name of the ONNX runtime CUDA provider library */ static final String ONNXRUNTIME_LIBRARY_CUDA_NAME = "onnxruntime_providers_cuda"; + /** The short name of the ONNX runtime ROCM provider library */ static final String ONNXRUNTIME_LIBRARY_ROCM_NAME = "onnxruntime_providers_rocm"; + /** The short name of the ONNX runtime DNNL provider library */ static final String ONNXRUNTIME_LIBRARY_DNNL_NAME = "onnxruntime_providers_dnnl"; + /** The short name of the ONNX runtime OpenVINO provider library */ static final String ONNXRUNTIME_LIBRARY_OPENVINO_NAME = "onnxruntime_providers_openvino"; + /** The short name of the ONNX runtime TensorRT provider library */ static final String ONNXRUNTIME_LIBRARY_TENSORRT_NAME = "onnxruntime_providers_tensorrt"; diff --git a/java/src/main/java/ai/onnxruntime/OnnxSequence.java b/java/src/main/java/ai/onnxruntime/OnnxSequence.java index 93e1be21588b..7722514b913b 100644 --- a/java/src/main/java/ai/onnxruntime/OnnxSequence.java +++ b/java/src/main/java/ai/onnxruntime/OnnxSequence.java @@ -8,6 +8,7 @@ import java.util.Arrays; import java.util.Collections; import java.util.List; +import java.util.logging.Logger; /** * A sequence of {@link OnnxValue}s all of the same type. @@ -24,6 +25,7 @@ * */ public class OnnxSequence implements OnnxValue { + private static final Logger logger = Logger.getLogger(OnnxSequence.class.getName()); static { try { @@ -40,6 +42,8 @@ public class OnnxSequence implements OnnxValue { private final SequenceInfo info; + private boolean closed; + /** * Creates the wrapper object for a native sequence. * @@ -53,6 +57,7 @@ public class OnnxSequence implements OnnxValue { this.nativeHandle = nativeHandle; this.allocatorHandle = allocatorHandle; this.info = info; + this.closed = false; } @Override @@ -76,6 +81,7 @@ public OnnxValueType getType() { */ @Override public List getValue() throws OrtException { + checkClosed(); if (info.sequenceOfMaps) { OnnxMap[] maps = getMaps(OnnxRuntime.ortApiHandle, nativeHandle, allocatorHandle); return Collections.unmodifiableList(Arrays.asList(maps)); @@ -110,10 +116,27 @@ public String toString() { return "OnnxSequence(info=" + info.toString() + ")"; } + @Override + public synchronized boolean isClosed() { + return closed; + } + /** Closes this sequence, releasing the native memory backing it and it's elements. 
*/ @Override - public void close() { - close(OnnxRuntime.ortApiHandle, nativeHandle); + public synchronized void close() { + if (!closed) { + close(OnnxRuntime.ortApiHandle, nativeHandle); + closed = true; + } else { + logger.warning("Closing an already closed sequence."); + } + } + + /** Checks if the OnnxValue is closed, if so throws {@link IllegalStateException}. */ + protected void checkClosed() { + if (closed) { + throw new IllegalStateException("Trying to use a closed OnnxValue"); + } } private native OnnxMap[] getMaps(long apiHandle, long nativeHandle, long allocatorHandle) diff --git a/java/src/main/java/ai/onnxruntime/OnnxSparseTensor.java b/java/src/main/java/ai/onnxruntime/OnnxSparseTensor.java index 53bd4c7f9b3e..8400ef53ff6d 100644 --- a/java/src/main/java/ai/onnxruntime/OnnxSparseTensor.java +++ b/java/src/main/java/ai/onnxruntime/OnnxSparseTensor.java @@ -14,6 +14,7 @@ import java.nio.LongBuffer; import java.nio.ShortBuffer; import java.util.Arrays; +import java.util.logging.Logger; /** * A Java object wrapping an OnnxSparseTensor. @@ -22,6 +23,7 @@ * different static inner class representing each type. */ public final class OnnxSparseTensor extends OnnxTensorLike { + private static final Logger logger = Logger.getLogger(OnnxSparseTensor.class.getName()); private final SparseTensorType sparseTensorType; // Held to prevent deallocation while used in native code. @@ -198,6 +200,7 @@ public OnnxValueType getType() { @Override public SparseTensor getValue() throws OrtException { + checkClosed(); Buffer buffer = getValuesBuffer(); long[] indicesShape = getIndicesShape(OnnxRuntime.ortApiHandle, nativeHandle); switch (sparseTensorType) { @@ -234,8 +237,13 @@ public SparseTensor getValue() throws OrtException { } @Override - public void close() { - close(OnnxRuntime.ortApiHandle, nativeHandle); + public synchronized void close() { + if (!closed) { + close(OnnxRuntime.ortApiHandle, nativeHandle); + closed = true; + } else { + logger.warning("Closing an already closed OnnxSparseTensor."); + } } /** @@ -257,6 +265,7 @@ public SparseTensorType getSparseTensorType() { * @return The indices. */ public Buffer getIndicesBuffer() { + checkClosed(); switch (sparseTensorType) { case COO: case CSRC: @@ -295,6 +304,7 @@ public Buffer getIndicesBuffer() { * @return The inner indices. */ public LongBuffer getInnerIndicesBuffer() { + checkClosed(); if (sparseTensorType == SparseTensorType.CSRC) { LongBuffer buf = getInnerIndicesBuffer(OnnxRuntime.ortApiHandle, nativeHandle) @@ -320,6 +330,7 @@ public LongBuffer getInnerIndicesBuffer() { * @return The data buffer. */ public Buffer getValuesBuffer() { + checkClosed(); ByteBuffer buffer = getValuesBuffer(OnnxRuntime.ortApiHandle, nativeHandle).order(ByteOrder.nativeOrder()); switch (info.type) { @@ -396,6 +407,7 @@ public Buffer getValuesBuffer() { * @return The indices shape. */ public long[] getIndicesShape() { + checkClosed(); return getIndicesShape(OnnxRuntime.ortApiHandle, nativeHandle); } @@ -405,6 +417,7 @@ public long[] getIndicesShape() { * @return The indices shape. */ public long[] getInnerIndicesShape() { + checkClosed(); if (sparseTensorType == SparseTensorType.CSRC) { return getInnerIndicesShape(OnnxRuntime.ortApiHandle, nativeHandle); } else { @@ -420,6 +433,7 @@ public long[] getInnerIndicesShape() { * @return The values shape. 
*/ public long[] getValuesShape() { + checkClosed(); return getValuesShape(OnnxRuntime.ortApiHandle, nativeHandle); } @@ -623,6 +637,7 @@ public abstract static class SparseTensor { /** The buffer holding the indices. */ final T indices; + /** The buffer holding the values. */ final Buffer values; diff --git a/java/src/main/java/ai/onnxruntime/OnnxTensor.java b/java/src/main/java/ai/onnxruntime/OnnxTensor.java index 0078adb6402f..e1ee2c14fd9d 100644 --- a/java/src/main/java/ai/onnxruntime/OnnxTensor.java +++ b/java/src/main/java/ai/onnxruntime/OnnxTensor.java @@ -14,12 +14,14 @@ import java.nio.LongBuffer; import java.nio.ShortBuffer; import java.util.Optional; +import java.util.logging.Logger; /** * A Java object wrapping an OnnxTensor. Tensors are the main input to the library, and can also be * returned as outputs. */ public class OnnxTensor extends OnnxTensorLike { + private static final Logger logger = Logger.getLogger(OnnxTensor.class.getName()); /** * This reference is held for OnnxTensors backed by a java.nio.Buffer to ensure the buffer does @@ -97,6 +99,7 @@ public OnnxValueType getType() { */ @Override public Object getValue() throws OrtException { + checkClosed(); if (info.isScalar()) { switch (info.type) { case FLOAT: @@ -144,16 +147,21 @@ public Object getValue() throws OrtException { @Override public String toString() { - return "OnnxTensor(info=" + info.toString() + ")"; + return "OnnxTensor(info=" + info.toString() + ",closed=" + closed + ")"; } /** - * Closes the tensor, releasing it's underlying memory (if it's not backed by an NIO buffer). If - * it is backed by a buffer then the memory is released when the buffer is GC'd. + * Closes the tensor, releasing its underlying memory (if it's not backed by an NIO buffer). If it + * is backed by a buffer then the memory is released when the buffer is GC'd. */ @Override - public void close() { - close(OnnxRuntime.ortApiHandle, nativeHandle); + public synchronized void close() { + if (!closed) { + close(OnnxRuntime.ortApiHandle, nativeHandle); + closed = true; + } else { + logger.warning("Closing an already closed tensor."); + } } /** @@ -165,6 +173,7 @@ public void close() { * @return A ByteBuffer copy of the OnnxTensor. */ public ByteBuffer getByteBuffer() { + checkClosed(); if (info.type != OnnxJavaType.STRING) { ByteBuffer buffer = getBuffer(OnnxRuntime.ortApiHandle, nativeHandle); ByteBuffer output = ByteBuffer.allocate(buffer.capacity()); @@ -183,6 +192,7 @@ public ByteBuffer getByteBuffer() { * @return A FloatBuffer copy of the OnnxTensor. */ public FloatBuffer getFloatBuffer() { + checkClosed(); if (info.type == OnnxJavaType.FLOAT) { // if it's fp32 use the efficient copy. FloatBuffer buffer = getBuffer().asFloatBuffer(); @@ -212,6 +222,7 @@ public FloatBuffer getFloatBuffer() { * @return A DoubleBuffer copy of the OnnxTensor. */ public DoubleBuffer getDoubleBuffer() { + checkClosed(); if (info.type == OnnxJavaType.DOUBLE) { DoubleBuffer buffer = getBuffer().asDoubleBuffer(); DoubleBuffer output = DoubleBuffer.allocate(buffer.capacity()); @@ -230,6 +241,7 @@ public DoubleBuffer getDoubleBuffer() { * @return A ShortBuffer copy of the OnnxTensor. */ public ShortBuffer getShortBuffer() { + checkClosed(); if ((info.type == OnnxJavaType.INT16) || (info.type == OnnxJavaType.FLOAT16) || (info.type == OnnxJavaType.BFLOAT16)) { @@ -250,6 +262,7 @@ public ShortBuffer getShortBuffer() { * @return An IntBuffer copy of the OnnxTensor. 
*/ public IntBuffer getIntBuffer() { + checkClosed(); if (info.type == OnnxJavaType.INT32) { IntBuffer buffer = getBuffer().asIntBuffer(); IntBuffer output = IntBuffer.allocate(buffer.capacity()); @@ -268,6 +281,7 @@ public IntBuffer getIntBuffer() { * @return A LongBuffer copy of the OnnxTensor. */ public LongBuffer getLongBuffer() { + checkClosed(); if (info.type == OnnxJavaType.INT64) { LongBuffer buffer = getBuffer().asLongBuffer(); LongBuffer output = LongBuffer.allocate(buffer.capacity()); diff --git a/java/src/main/java/ai/onnxruntime/OnnxTensorLike.java b/java/src/main/java/ai/onnxruntime/OnnxTensorLike.java index c2989fe296dc..bbfd4e981ece 100644 --- a/java/src/main/java/ai/onnxruntime/OnnxTensorLike.java +++ b/java/src/main/java/ai/onnxruntime/OnnxTensorLike.java @@ -28,6 +28,9 @@ public abstract class OnnxTensorLike implements OnnxValue { /** The size and shape information for this tensor. */ protected final TensorInfo info; + /** Is this value closed? */ + protected boolean closed; + /** * Constructs a tensor-like (the base class of OnnxTensor and OnnxSparseTensor). * @@ -39,6 +42,7 @@ public abstract class OnnxTensorLike implements OnnxValue { this.nativeHandle = nativeHandle; this.allocatorHandle = allocatorHandle; this.info = info; + this.closed = false; } /** @@ -59,4 +63,16 @@ long getNativeHandle() { public TensorInfo getInfo() { return info; } + + @Override + public synchronized boolean isClosed() { + return closed; + } + + /** Checks if the OnnxValue is closed, if so throws {@link IllegalStateException}. */ + protected void checkClosed() { + if (closed) { + throw new IllegalStateException("Trying to use a closed OnnxValue"); + } + } } diff --git a/java/src/main/java/ai/onnxruntime/OnnxValue.java b/java/src/main/java/ai/onnxruntime/OnnxValue.java index 752a0e74267d..e829bc80f09f 100644 --- a/java/src/main/java/ai/onnxruntime/OnnxValue.java +++ b/java/src/main/java/ai/onnxruntime/OnnxValue.java @@ -64,7 +64,14 @@ public enum OnnxValueType { */ public ValueInfo getInfo(); - /** Closes the OnnxValue, freeing it's native memory. */ + /** + * Checks if this value is closed (i.e., the native object has been released). + * + * @return True if the value is closed and the native object has been released. + */ + public boolean isClosed(); + + /** Closes the OnnxValue, freeing its native memory. */ @Override public void close(); diff --git a/java/src/main/java/ai/onnxruntime/OrtProviderOptions.java b/java/src/main/java/ai/onnxruntime/OrtProviderOptions.java index 39a5121fad7a..70af10ff8cd7 100644 --- a/java/src/main/java/ai/onnxruntime/OrtProviderOptions.java +++ b/java/src/main/java/ai/onnxruntime/OrtProviderOptions.java @@ -5,11 +5,14 @@ package ai.onnxruntime; import java.io.IOException; +import java.util.logging.Logger; /** An abstract base class for execution provider options classes. */ // Note this lives in ai.onnxruntime to allow subclasses to access the OnnxRuntime.ortApiHandle // package private field. public abstract class OrtProviderOptions implements AutoCloseable { + private static final Logger logger = Logger.getLogger(OrtProviderOptions.class.getName()); + static { try { OnnxRuntime.init(); @@ -21,6 +24,9 @@ public abstract class OrtProviderOptions implements AutoCloseable { /** The native pointer. */ protected final long nativeHandle; + /** Is the native object closed? */ + protected boolean closed; + /** * Constructs a OrtProviderOptions wrapped around a native pointer. 
* @@ -28,6 +34,7 @@ public abstract class OrtProviderOptions implements AutoCloseable { */ protected OrtProviderOptions(long nativeHandle) { this.nativeHandle = nativeHandle; + this.closed = false; } /** @@ -46,9 +53,30 @@ protected static long getApiHandle() { */ public abstract OrtProvider getProvider(); + /** + * Is the native object closed? + * + * @return True if the native object has been released. + */ + public synchronized boolean isClosed() { + return closed; + } + @Override public void close() { - close(OnnxRuntime.ortApiHandle, nativeHandle); + if (!closed) { + close(OnnxRuntime.ortApiHandle, nativeHandle); + closed = true; + } else { + logger.warning("Closing an already closed OrtProviderOptions."); + } + } + + /** Checks if the OrtProviderOptions is closed, if so throws {@link IllegalStateException}. */ + protected void checkClosed() { + if (closed) { + throw new IllegalStateException("Trying to use a closed OrtProviderOptions"); + } } /**
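OrtProviderOptions gains the same closed-state tracking, and every execution provider options class inherits it, so a stale options object now fails fast instead of touching freed native memory. A short sketch, assuming a CUDA-enabled build of onnxruntime (the configuration key is one of the CUDA provider's string options and is illustrative here):

    import ai.onnxruntime.OrtException;
    import ai.onnxruntime.providers.OrtCUDAProviderOptions;

    public class ProviderOptionsLifecycle {
      public static void main(String[] args) throws OrtException {
        try (OrtCUDAProviderOptions cudaOpts = new OrtCUDAProviderOptions(0)) {
          // add() is guarded by checkClosed() after this change
          cudaOpts.add("arena_extend_strategy", "kSameAsRequested");
        } // closed here; a second close() would only log a warning
      }
    }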
diff --git a/java/src/main/java/ai/onnxruntime/OrtTrainingSession.java b/java/src/main/java/ai/onnxruntime/OrtTrainingSession.java index 49ddf29c2233..eeede3a1bed0 100644 --- a/java/src/main/java/ai/onnxruntime/OrtTrainingSession.java +++ b/java/src/main/java/ai/onnxruntime/OrtTrainingSession.java @@ -12,6 +12,7 @@ import java.util.Map; import java.util.Objects; import java.util.Set; +import java.util.logging.Logger; /** * Wraps an ONNX training model and allows training and inference calls. @@ -1049,8 +1050,12 @@ private native void exportModelForInference( /** Wrapper class for the checkpoint state. */ static final class OrtCheckpointState implements AutoCloseable { + private static final Logger logger = Logger.getLogger(OrtCheckpointState.class.getName()); + final long nativeHandle; + private boolean closed; + /** * Wraps an object around the checkpoint native handle. * @@ -1058,6 +1063,7 @@ static final class OrtCheckpointState implements AutoCloseable { */ OrtCheckpointState(long nativeHandle) { this.nativeHandle = nativeHandle; + this.closed = false; } /** @@ -1097,6 +1103,7 @@ static OrtCheckpointState loadCheckpoint(String checkpoint) throws OrtException * @throws OrtException If the checkpoint failed to save. */ public void saveCheckpoint(Path outputPath, boolean saveOptimizer) throws OrtException { + checkClosed(); Objects.requireNonNull(outputPath, "checkpoint path must not be null"); String outputStr = outputPath.toString(); saveCheckpoint( @@ -1115,6 +1122,7 @@ public void saveCheckpoint(Path outputPath, boolean saveOptimizer) throws OrtExc * @throws OrtException If the call failed. */ public void addProperty(String name, float value) throws OrtException { + checkClosed(); addProperty( OnnxRuntime.ortApiHandle, OnnxRuntime.ortTrainingApiHandle, nativeHandle, name, value); } @@ -1127,6 +1135,7 @@ public void addProperty(String name, float value) throws OrtException { * @throws OrtException If the call failed. */ public void addProperty(String name, int value) throws OrtException { + checkClosed(); addProperty( OnnxRuntime.ortApiHandle, OnnxRuntime.ortTrainingApiHandle, nativeHandle, name, value); } @@ -1139,6 +1148,7 @@ public void addProperty(String name, int value) throws OrtException { * @throws OrtException If the call failed. */ public void addProperty(String name, String value) throws OrtException { + checkClosed(); addProperty( OnnxRuntime.ortApiHandle, OnnxRuntime.ortTrainingApiHandle, nativeHandle, name, value); } @@ -1152,6 +1162,7 @@ public void addProperty(String name, String value) throws OrtException { * @throws OrtException If the property does not exist, or is of the wrong type. */ public float getFloatProperty(OrtAllocator allocator, String name) throws OrtException { + checkClosed(); return getFloatProperty( OnnxRuntime.ortApiHandle, OnnxRuntime.ortTrainingApiHandle, @@ -1169,6 +1180,7 @@ public float getFloatProperty(OrtAllocator allocator, String name) throws OrtExc * @throws OrtException If the property does not exist, or is of the wrong type. */ public int getIntProperty(OrtAllocator allocator, String name) throws OrtException { + checkClosed(); return getIntProperty( OnnxRuntime.ortApiHandle, OnnxRuntime.ortTrainingApiHandle, @@ -1186,6 +1198,7 @@ public int getIntProperty(OrtAllocator allocator, String name) throws OrtExcepti * @throws OrtException If the property does not exist, or is of the wrong type. */ public String getStringProperty(OrtAllocator allocator, String name) throws OrtException { + checkClosed(); return getStringProperty( OnnxRuntime.ortApiHandle, OnnxRuntime.ortTrainingApiHandle, @@ -1194,9 +1207,25 @@ public String getStringProperty(OrtAllocator allocator, String name) throws OrtE name); } + /** Checks if the OrtCheckpointState is closed, if so throws {@link IllegalStateException}. */ + private void checkClosed() { + if (closed) { + throw new IllegalStateException("Trying to use a closed OrtCheckpointState"); + } + } + + public synchronized boolean isClosed() { + return closed; + } + @Override - public void close() { - close(OnnxRuntime.ortTrainingApiHandle, nativeHandle); + public synchronized void close() { + if (!closed) { + close(OnnxRuntime.ortTrainingApiHandle, nativeHandle); + closed = true; + } else { + logger.warning("Closing a checkpoint twice"); + } } /* diff --git a/java/src/main/java/ai/onnxruntime/TensorInfo.java b/java/src/main/java/ai/onnxruntime/TensorInfo.java index 69ccb954e8af..1c21387b5045 100644 --- a/java/src/main/java/ai/onnxruntime/TensorInfo.java +++ b/java/src/main/java/ai/onnxruntime/TensorInfo.java @@ -7,6 +7,7 @@ import java.lang.reflect.Array; import java.nio.Buffer; import java.util.Arrays; +import java.util.stream.Collectors; /** Describes an {@link OnnxTensor}, including its size, shape and element type. */ public class TensorInfo implements ValueInfo { @@ -159,6 +160,12 @@ public static OnnxTensorType mapFromJavaType(OnnxJavaType type) { /** The shape of the tensor. */ final long[] shape; + /** The names of the unbound dimensions. */ + final String[] dimensionNames; + + /** Whether there are non-empty dimension names */ + private final boolean hasNames; + /** The Java type of this tensor. */ public final OnnxJavaType type; @@ -177,6 +184,9 @@ public static OnnxTensorType mapFromJavaType(OnnxJavaType type) { */ TensorInfo(long[] shape, OnnxJavaType type, OnnxTensorType onnxType) { this.shape = shape; + this.dimensionNames = new String[shape.length]; + Arrays.fill(dimensionNames, ""); + this.hasNames = false; this.type = type; this.onnxType = onnxType; this.numElements = elementCount(shape); @@ -188,10 +198,20 @@ public static OnnxTensorType mapFromJavaType(OnnxJavaType type) { * <p>Called from JNI. * * @param shape The tensor shape. + * @param names The dimension names. * @param typeInt The native type int. */ - TensorInfo(long[] shape, int typeInt) { + TensorInfo(long[] shape, String[] names, int typeInt) { this.shape = shape; + this.dimensionNames = names; + boolean hasNames = false; + for (String s : names) { + if (!s.isEmpty()) { + hasNames = true; + break; + } + } + this.hasNames = hasNames; this.onnxType = OnnxTensorType.mapFromInt(typeInt); this.type = OnnxJavaType.mapFromOnnxTensorType(this.onnxType); this.numElements = elementCount(shape); @@ -206,15 +226,42 @@ public long[] getShape() { return Arrays.copyOf(shape, shape.length); } + /** + * Get a copy of the tensor's named dimensions. + * + * @return A copy of the tensor's named dimensions. + */ + public String[] getDimensionNames() { + return Arrays.copyOf(dimensionNames, dimensionNames.length); + } + @Override public String toString() { - return "TensorInfo(javaType=" - + type.toString() - + ",onnxType=" - + onnxType.toString() - + ",shape=" - + Arrays.toString(shape) - + ")"; + String output = + "TensorInfo(javaType=" + + type.toString() + + ",onnxType=" + + onnxType.toString() + + ",shape=" + + Arrays.toString(shape); + if (hasNames) { + output = + output + + ",dimNames=[" + + Arrays.stream(dimensionNames) + .map( + a -> { + if (a.isEmpty()) { + return "\"\""; + } else { + return a; + } + }) + .collect(Collectors.joining(",")) + + "]"; + } + output = output + ")"; + return output; } /**
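With the extra constructor argument threaded through from JNI, symbolic dimension names now surface on TensorInfo. A sketch of reading them back, assuming a model whose input "A" has shape [n, 2] with "n" symbolic (the file name and input name are illustrative, following the updated testSymbolicDimensionAssignment test later in this diff):

    import ai.onnxruntime.OrtEnvironment;
    import ai.onnxruntime.OrtException;
    import ai.onnxruntime.OrtSession;
    import ai.onnxruntime.TensorInfo;
    import java.util.Arrays;

    public class DimensionNames {
      public static void main(String[] args) throws OrtException {
        OrtEnvironment env = OrtEnvironment.getEnvironment();
        try (OrtSession session = env.createSession("model.onnx", new OrtSession.SessionOptions())) {
          TensorInfo info = (TensorInfo) session.getInputInfo().get("A").getInfo();
          // Prints ["n", ""]: unnamed dimensions come back as empty strings.
          System.out.println(Arrays.toString(info.getDimensionNames()));
        }
      }
    }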
diff --git a/java/src/main/java/ai/onnxruntime/providers/CoreMLFlags.java b/java/src/main/java/ai/onnxruntime/providers/CoreMLFlags.java index eb124decf75f..cec3fadf446c 100644 --- a/java/src/main/java/ai/onnxruntime/providers/CoreMLFlags.java +++ b/java/src/main/java/ai/onnxruntime/providers/CoreMLFlags.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, 2023, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2021, 2024, Oracle and/or its affiliates. All rights reserved. * Licensed under the MIT License. */ package ai.onnxruntime.providers; @@ -14,7 +14,18 @@ public enum CoreMLFlags implements OrtFlags { /** Enables CoreML on subgraphs. */ ENABLE_ON_SUBGRAPH(2), // COREML_FLAG_ENABLE_ON_SUBGRAPH(0x002) /** Only enable usage of CoreML if the device has an Apple Neural Engine. */ - ONLY_ENABLE_DEVICE_WITH_ANE(4); // COREML_FLAG_ONLY_ENABLE_DEVICE_WITH_ANE(0x004), + ONLY_ENABLE_DEVICE_WITH_ANE(4), // COREML_FLAG_ONLY_ENABLE_DEVICE_WITH_ANE(0x004) + /** + * Only allow CoreML EP to take nodes with inputs with static shapes. By default it will also + * allow inputs with dynamic shapes. However, the performance may be negatively impacted if inputs + * have dynamic shapes. + */ + ONLY_ALLOW_STATIC_INPUT_SHAPES(8), // COREML_FLAG_ONLY_ALLOW_STATIC_INPUT_SHAPES(0x008) + /** + * Create an MLProgram. By default it will create a NeuralNetwork model. Requires Core ML 5 or + * later. + */ + CREATE_MLPROGRAM(16); // COREML_FLAG_CREATE_MLPROGRAM(0x010) /** The native value of the enum. */ public final int value;
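The two new flags correspond to COREML_FLAG_ONLY_ALLOW_STATIC_INPUT_SHAPES (0x008) and COREML_FLAG_CREATE_MLPROGRAM (0x010) and are passed through OrtSession.SessionOptions.addCoreML. A sketch, assuming a macOS or iOS build with the CoreML EP compiled in:

    import ai.onnxruntime.OrtException;
    import ai.onnxruntime.OrtSession;
    import ai.onnxruntime.providers.CoreMLFlags;
    import java.util.EnumSet;

    public class CoreMLOptions {
      public static OrtSession.SessionOptions coreMlOptions() throws OrtException {
        OrtSession.SessionOptions opts = new OrtSession.SessionOptions();
        // Emit an MLProgram model and keep CoreML off dynamically shaped inputs.
        opts.addCoreML(
            EnumSet.of(CoreMLFlags.CREATE_MLPROGRAM, CoreMLFlags.ONLY_ALLOW_STATIC_INPUT_SHAPES));
        return opts;
      }
    }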
diff --git a/java/src/main/java/ai/onnxruntime/providers/StringConfigProviderOptions.java b/java/src/main/java/ai/onnxruntime/providers/StringConfigProviderOptions.java index 02207b2949e5..961163035c9a 100644 --- a/java/src/main/java/ai/onnxruntime/providers/StringConfigProviderOptions.java +++ b/java/src/main/java/ai/onnxruntime/providers/StringConfigProviderOptions.java @@ -32,6 +32,7 @@ protected StringConfigProviderOptions(long nativeHandle) { * @throws OrtException If the addition failed. */ public void add(String key, String value) throws OrtException { + checkClosed(); Objects.requireNonNull(key, "Key must not be null"); Objects.requireNonNull(value, "Value must not be null"); options.put(key, value); diff --git a/java/src/main/native/OrtJniUtil.c b/java/src/main/native/OrtJniUtil.c index 879ba8a31061..7b2629158139 100644 --- a/java/src/main/native/OrtJniUtil.c +++ b/java/src/main/native/OrtJniUtil.c @@ -342,7 +342,6 @@ jobject convertToTensorInfo(JNIEnv *jniEnv, const OrtApi * api, const OrtTensorT if (code != ORT_OK) { return NULL; } - //printf("numDim %d\n",numDim); int64_t* dimensions = (int64_t*) malloc(sizeof(int64_t)*numDim); code = checkOrtStatus(jniEnv, api, api->GetDimensions(info, dimensions, numDim)); if (code != ORT_OK) { @@ -358,12 +357,31 @@ jobject convertToTensorInfo(JNIEnv *jniEnv, const OrtApi * api, const OrtTensorT free(dimensions); dimensions = NULL; + // Create the string array for the names. + const char** dimensionNames = (const char**) malloc(sizeof(char*)*numDim); + if (dimensionNames == NULL) { + throwOrtException(jniEnv, 1, "Not enough memory"); + return NULL; + } + code = checkOrtStatus(jniEnv, api, api->GetSymbolicDimensions(info, dimensionNames, numDim)); + if (code != ORT_OK) { + // extraction failed, exception has been thrown, return to Java. + free(dimensionNames); + return NULL; + } + jclass stringClazz = (*jniEnv)->FindClass(jniEnv, "java/lang/String"); + jobjectArray names = (*jniEnv)->NewObjectArray(jniEnv, safecast_size_t_to_jsize(numDim), stringClazz, NULL); + for (size_t i = 0; i < numDim; i++) { + jobject javaName = (*jniEnv)->NewStringUTF(jniEnv, dimensionNames[i]); + (*jniEnv)->SetObjectArrayElement(jniEnv, names, safecast_size_t_to_jsize(i), javaName); + } + free(dimensionNames); + // Create the TensorInfo object static const char *tensorInfoClassName = "ai/onnxruntime/TensorInfo"; jclass clazz = (*jniEnv)->FindClass(jniEnv, tensorInfoClassName); - jmethodID tensorInfoConstructor = (*jniEnv)->GetMethodID(jniEnv,clazz, "<init>", "([JI)V"); - //printf("TensorInfo class %p, methodID %p\n",clazz,tensorInfoConstructor); - jobject tensorInfo = (*jniEnv)->NewObject(jniEnv, clazz, tensorInfoConstructor, shape, onnxTypeInt); + jmethodID tensorInfoConstructor = (*jniEnv)->GetMethodID(jniEnv,clazz, "<init>", "([J[Ljava/lang/String;I)V"); + jobject tensorInfo = (*jniEnv)->NewObject(jniEnv, clazz, tensorInfoConstructor, shape, names, onnxTypeInt); return tensorInfo; } diff --git a/java/src/main/native/ai_onnxruntime_OrtSession_SessionOptions.c b/java/src/main/native/ai_onnxruntime_OrtSession_SessionOptions.c index 3a1c0d1bb8fa..337f4c1921c6 100644 --- a/java/src/main/native/ai_onnxruntime_OrtSession_SessionOptions.c +++ b/java/src/main/native/ai_onnxruntime_OrtSession_SessionOptions.c @@ -8,7 +8,7 @@ #include "onnxruntime/core/session/onnxruntime_c_api.h" #include "OrtJniUtil.h" #include "ai_onnxruntime_OrtSession_SessionOptions.h" -#ifdef WIN32 +#ifdef _WIN32 #include <Windows.h> #else #include <dlfcn.h> @@ -318,7 +318,7 @@ JNIEXPORT void JNICALL Java_ai_onnxruntime_OrtSession_00024SessionOptions_closeC // Iterate the handles, calling the appropriate close function for (jint i = 0; i < numHandles; i++) { -#ifdef WIN32 +#ifdef _WIN32 FreeLibrary((void*)handles[i]); #else dlclose((void*)handles[i]); @@ -630,7 +630,7 @@ JNIEXPORT void JNICALL Java_ai_onnxruntime_OrtSession_00024SessionOptions_addMIG JNIEXPORT void JNICALL Java_ai_onnxruntime_OrtSession_00024SessionOptions_addDirectML (JNIEnv * jniEnv, jobject jobj, jlong apiHandle, jlong handle, jint deviceID) { (void)jobj;
- #ifdef USE_DIRECTML + #ifdef USE_DML checkOrtStatus(jniEnv,(const OrtApi*)apiHandle,OrtSessionOptionsAppendExecutionProvider_DML((OrtSessionOptions*) handle, deviceID)); #else (void)apiHandle;(void)handle;(void)deviceID; // Parameters used when DirectML is defined. diff --git a/java/src/main/native/ai_onnxruntime_OrtTrainingSession.c b/java/src/main/native/ai_onnxruntime_OrtTrainingSession.c index 9f7b8d3a3dcf..464234c34798 100644 --- a/java/src/main/native/ai_onnxruntime_OrtTrainingSession.c +++ b/java/src/main/native/ai_onnxruntime_OrtTrainingSession.c @@ -66,7 +66,7 @@ JNIEXPORT jlong JNICALL Java_ai_onnxruntime_OrtTrainingSession_createTrainingSes } } wchar_t* optimizerStr = NULL; - if (optimizerPath == NULL) { + if (optimizerPath != NULL) { optimizerStr = copyAndPad(jniEnv, optimizerPath); if (optimizerStr == NULL) { // exception has been thrown in Java, go to cleanup and return null. diff --git a/java/src/test/java/ai/onnxruntime/InferenceTest.java b/java/src/test/java/ai/onnxruntime/InferenceTest.java index e975117fb75b..ac65cbab146b 100644 --- a/java/src/test/java/ai/onnxruntime/InferenceTest.java +++ b/java/src/test/java/ai/onnxruntime/InferenceTest.java @@ -69,7 +69,9 @@ public void environmentTest() { // Checks that the environment instance is the same. OrtEnvironment otherEnv = OrtEnvironment.getEnvironment(); assertSame(env, otherEnv); + TestHelpers.quietLogger(OrtEnvironment.class); otherEnv = OrtEnvironment.getEnvironment("test-name"); + TestHelpers.loudLogger(OrtEnvironment.class); assertSame(env, otherEnv); } @@ -588,6 +590,12 @@ public void testSymbolicDimensionAssignment() throws OrtException { Map infoMap = session.getInputInfo(); TensorInfo aInfo = (TensorInfo) infoMap.get("A").getInfo(); assertArrayEquals(new long[] {-1, 2}, aInfo.shape); + assertEquals(2, aInfo.dimensionNames.length); + assertEquals("n", aInfo.dimensionNames[0]); + assertEquals("", aInfo.dimensionNames[1]); + TensorInfo bInfo = (TensorInfo) infoMap.get("B").getInfo(); + assertEquals(1, bInfo.dimensionNames.length); + assertEquals("m", bInfo.dimensionNames[0]); } } // Check that when the options are assigned it overrides the symbolic dimension @@ -643,6 +651,12 @@ public void testCoreML() throws OrtException { runProvider(OrtProvider.CORE_ML); } + @Test + @EnabledIfSystemProperty(named = "USE_DML", matches = "1") + public void testDirectML() throws OrtException { + runProvider(OrtProvider.DIRECT_ML); + } + private void runProvider(OrtProvider provider) throws OrtException { EnumSet providers = OrtEnvironment.getAvailableProviders(); assertTrue(providers.size() > 1); @@ -665,7 +679,7 @@ private void runProvider(OrtProvider provider) throws OrtException { // CoreML gives slightly different answers on a 2020 13" M1 MBP assertArrayEquals(expectedOutput, resultArray, 1e-2f); } else { - assertArrayEquals(expectedOutput, resultArray, 1e-6f); + assertArrayEquals(expectedOutput, resultArray, 1e-5f); } } catch (OrtException e) { throw new IllegalStateException("Failed to execute a scoring operation", e); @@ -1918,6 +1932,8 @@ private static SqueezeNetTuple openSessionSqueezeNet(EnumSet provid options.addNnapi(); break; case DIRECT_ML: + options.setMemoryPatternOptimization(false); + options.setExecutionMode(ExecutionMode.SEQUENTIAL); options.addDirectML(0); break; case ACL: diff --git a/java/src/test/java/ai/onnxruntime/OnnxTensorTest.java b/java/src/test/java/ai/onnxruntime/OnnxTensorTest.java index a5f285ba86a1..c060cf73ecf1 100644 --- a/java/src/test/java/ai/onnxruntime/OnnxTensorTest.java +++ 
b/java/src/test/java/ai/onnxruntime/OnnxTensorTest.java @@ -4,6 +4,10 @@ */ package ai.onnxruntime; +import static org.junit.jupiter.api.Assertions.assertArrayEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertThrows; + import ai.onnxruntime.platform.Fp16Conversions; import java.nio.ByteBuffer; import java.nio.ByteOrder; @@ -97,8 +101,8 @@ public void testBufferCreation() throws OrtException { float[] arrValues = new float[] {0, 1, 2, 3, 4}; try (OnnxTensor t = OnnxTensor.createTensor(env, arrValues)) { // array creation isn't backed by buffers - Assertions.assertFalse(t.ownsBuffer()); - Assertions.assertFalse(t.getBufferRef().isPresent()); + assertFalse(t.ownsBuffer()); + assertFalse(t.getBufferRef().isPresent()); FloatBuffer buf = t.getFloatBuffer(); float[] output = new float[arrValues.length]; buf.get(output); @@ -146,7 +150,7 @@ public void testBufferCreation() throws OrtException { directBuffer.rewind(); try (OnnxTensor t = OnnxTensor.createTensor(env, directBuffer, new long[] {1, 5})) { // direct buffers don't trigger a copy - Assertions.assertFalse(t.ownsBuffer()); + assertFalse(t.ownsBuffer()); // tensors backed by buffers can get the buffer ref back out Assertions.assertTrue(t.getBufferRef().isPresent()); FloatBuffer buf = t.getFloatBuffer(); @@ -428,4 +432,21 @@ public void testBf16RoundTrip() { } } } + + @Test + public void testClose() throws OrtException { + OrtEnvironment env = OrtEnvironment.getEnvironment(); + long[] input = new long[] {1, 2, 3, 4, 5}; + OnnxTensor value = OnnxTensor.createTensor(env, input); + assertFalse(value.isClosed()); + long[] output = (long[]) value.getValue(); + assertArrayEquals(input, output); + value.close(); + // check use after close throws + assertThrows(IllegalStateException.class, value::getValue); + // check double close doesn't crash (emits warning) + TestHelpers.quietLogger(OnnxTensor.class); + value.close(); + TestHelpers.loudLogger(OnnxTensor.class); + } } diff --git a/java/src/test/java/ai/onnxruntime/TestHelpers.java b/java/src/test/java/ai/onnxruntime/TestHelpers.java index 55d8169434d4..c13cdf222b15 100644 --- a/java/src/test/java/ai/onnxruntime/TestHelpers.java +++ b/java/src/test/java/ai/onnxruntime/TestHelpers.java @@ -22,6 +22,8 @@ import java.util.Comparator; import java.util.List; import java.util.Map; +import java.util.logging.Level; +import java.util.logging.Logger; import java.util.regex.Pattern; import org.junit.jupiter.api.Assertions; @@ -258,6 +260,16 @@ static void flattenStringBase(String[] input, List output) { output.addAll(Arrays.asList(input)); } + static void loudLogger(Class loggerClass) { + Logger l = Logger.getLogger(loggerClass.getName()); + l.setLevel(Level.INFO); + } + + static void quietLogger(Class loggerClass) { + Logger l = Logger.getLogger(loggerClass.getName()); + l.setLevel(Level.OFF); + } + public static Path getResourcePath(String path) { return new File(TestHelpers.class.getResource(path).getFile()).toPath(); } diff --git a/java/src/test/java/ai/onnxruntime/providers/ProviderOptionsTest.java b/java/src/test/java/ai/onnxruntime/providers/ProviderOptionsTest.java index 1ed883ace36e..0e3bc15ba9c7 100644 --- a/java/src/test/java/ai/onnxruntime/providers/ProviderOptionsTest.java +++ b/java/src/test/java/ai/onnxruntime/providers/ProviderOptionsTest.java @@ -96,7 +96,7 @@ private static void runProvider(OrtProvider provider, OrtSession.SessionOptions OnnxValue resultTensor = result.get(0); float[] resultArray = 
TestHelpers.flattenFloat(resultTensor.getValue()); assertEquals(expectedOutput.length, resultArray.length); - assertArrayEquals(expectedOutput, resultArray, 1e-6f); + assertArrayEquals(expectedOutput, resultArray, 1e-5f); } catch (OrtException e) { throw new IllegalStateException("Failed to execute a scoring operation", e); } diff --git a/java/src/test/java/sample/ScoreMNIST.java b/java/src/test/java/sample/ScoreMNIST.java index 5587b58e17f5..6ecbc5cd56d1 100644 --- a/java/src/test/java/sample/ScoreMNIST.java +++ b/java/src/test/java/sample/ScoreMNIST.java @@ -30,6 +30,7 @@ public class ScoreMNIST { private static final Logger logger = Logger.getLogger(ScoreMNIST.class.getName()); + /** Pattern for splitting libsvm format files. */ private static final Pattern splitPattern = Pattern.compile("\\s+"); diff --git a/js/common/lib/backend-impl.ts b/js/common/lib/backend-impl.ts index 3e1e833addb9..e90efd7b97c2 100644 --- a/js/common/lib/backend-impl.ts +++ b/js/common/lib/backend-impl.ts @@ -2,6 +2,7 @@ // Licensed under the MIT License. import {Backend} from './backend.js'; +import {InferenceSession} from './inference-session.js'; interface BackendInfo { backend: Backend; @@ -10,6 +11,7 @@ interface BackendInfo { initPromise?: Promise; initialized?: boolean; aborted?: boolean; + error?: string; } const backends: Map = new Map(); @@ -60,43 +62,100 @@ export const registerBackend = (name: string, backend: Backend, priority: number }; /** - * Resolve backend by specified hints. + * Try to resolve and initialize a backend. * - * @param backendHints - a list of execution provider names to lookup. If omitted use registered backends as list. - * @returns a promise that resolves to the backend. + * @param backendName - the name of the backend. + * @returns the backend instance if resolved and initialized successfully, or an error message if failed. + */ +const tryResolveAndInitializeBackend = async(backendName: string): Promise => { + const backendInfo = backends.get(backendName); + if (!backendInfo) { + return 'backend not found.'; + } + + if (backendInfo.initialized) { + return backendInfo.backend; + } else if (backendInfo.aborted) { + return backendInfo.error!; + } else { + const isInitializing = !!backendInfo.initPromise; + try { + if (!isInitializing) { + backendInfo.initPromise = backendInfo.backend.init(backendName); + } + await backendInfo.initPromise; + backendInfo.initialized = true; + return backendInfo.backend; + } catch (e) { + if (!isInitializing) { + backendInfo.error = `${e}`; + backendInfo.aborted = true; + } + return backendInfo.error!; + } finally { + delete backendInfo.initPromise; + } + } +}; + +/** + * Resolve execution providers from the specific session options. + * + * @param options - the session options object. + * @returns a promise that resolves to a tuple of an initialized backend instance and a session options object with + * filtered EP list. * * @ignore */ -export const resolveBackend = async(backendHints: readonly string[]): Promise => { - const backendNames = backendHints.length === 0 ? 
backendsSortedByPriority : backendHints; - const errors = []; - for (const backendName of backendNames) { - const backendInfo = backends.get(backendName); - if (backendInfo) { - if (backendInfo.initialized) { - return backendInfo.backend; - } else if (backendInfo.aborted) { - continue; // current backend is unavailable; try next - } +export const resolveBackendAndExecutionProviders = async(options: InferenceSession.SessionOptions): + Promise<[backend: Backend, options: InferenceSession.SessionOptions]> => { + // extract backend hints from session options + const eps = options.executionProviders || []; + const backendHints = eps.map(i => typeof i === 'string' ? i : i.name); + const backendNames = backendHints.length === 0 ? backendsSortedByPriority : backendHints; - const isInitializing = !!backendInfo.initPromise; - try { - if (!isInitializing) { - backendInfo.initPromise = backendInfo.backend.init(backendName); + // try to resolve and initialize all requested backends + let backend: Backend|undefined; + const errors = []; + const availableBackendNames = new Set(); + for (const backendName of backendNames) { + const resolveResult = await tryResolveAndInitializeBackend(backendName); + if (typeof resolveResult === 'string') { + errors.push({name: backendName, err: resolveResult}); + } else { + if (!backend) { + backend = resolveResult; + } + if (backend === resolveResult) { + availableBackendNames.add(backendName); + } } - await backendInfo.initPromise; - backendInfo.initialized = true; - return backendInfo.backend; - } catch (e) { - if (!isInitializing) { - errors.push({name: backendName, err: e}); + } + + // if no backend is available, throw error. + if (!backend) { + throw new Error(`no available backend found. ERR: ${errors.map(e => `[${e.name}] ${e.err}`).join(', ')}`); + } + + // for each explicitly requested backend, if it's not available, output warning message. + for (const {name, err} of errors) { + if (backendHints.includes(name)) { + // eslint-disable-next-line no-console + console.warn(`removing requested execution provider "${ + name}" from session options because it is not available: ${err}`); } - backendInfo.aborted = true; - } finally { - delete backendInfo.initPromise; } - } - } - throw new Error(`no available backend found. ERR: ${errors.map(e => `[${e.name}] ${e.err}`).join(', ')}`); -}; + const filteredEps = eps.filter(i => availableBackendNames.has(typeof i === 'string' ? i : i.name)); + + return [ + backend, new Proxy(options, { + get: (target, prop) => { + if (prop === 'executionProviders') { + return filteredEps; + } + return Reflect.get(target, prop); + } + }) + ]; + }; diff --git a/js/common/lib/backend.ts b/js/common/lib/backend.ts index 9bfcb1220605..8c07bdd5c5c4 100644 --- a/js/common/lib/backend.ts +++ b/js/common/lib/backend.ts @@ -58,7 +58,7 @@ export interface TrainingSessionHandler extends SessionHandler { options: InferenceSession.RunOptions): Promise; getParametersSize(trainableOnly: boolean): Promise; - loadParametersBuffer(array: Uint8Array, trainableOnly: boolean): Promise; + loadParametersBuffer(buffer: Uint8Array, trainableOnly: boolean): Promise; getContiguousParameters(trainableOnly: boolean): Promise; } @@ -77,8 +77,8 @@ export interface Backend { Promise; createTrainingSessionHandler? 
- (checkpointStateUriOrBuffer: TrainingSession.URIorBuffer, trainModelUriOrBuffer: TrainingSession.URIorBuffer, - evalModelUriOrBuffer: TrainingSession.URIorBuffer, optimizerModelUriOrBuffer: TrainingSession.URIorBuffer, + (checkpointStateUriOrBuffer: TrainingSession.UriOrBuffer, trainModelUriOrBuffer: TrainingSession.UriOrBuffer, + evalModelUriOrBuffer: TrainingSession.UriOrBuffer, optimizerModelUriOrBuffer: TrainingSession.UriOrBuffer, options: InferenceSession.SessionOptions): Promise; } diff --git a/js/common/lib/env.ts b/js/common/lib/env.ts index 0cded7e5edbc..c8df1613b326 100644 --- a/js/common/lib/env.ts +++ b/js/common/lib/env.ts @@ -33,6 +33,14 @@ export declare namespace Env { */ simd?: boolean; + /** + * Set or get a boolean value indicating whether to enable trace. + * + * @deprecated Use `env.trace` instead. If `env.trace` is set, this property will be ignored. + * @defaultValue `false` + */ + trace?: boolean; + /** * Set or get a number specifying the timeout for initialization of WebAssembly backend, in milliseconds. A zero * value indicates no timeout is set. @@ -103,6 +111,7 @@ export declare namespace Env { kernelId: number; kernelType: string; kernelName: string; + programName: string; startTime: number; endTime: number; } @@ -134,13 +143,52 @@ export declare namespace Env { */ ondata?: (data: WebGpuProfilingData) => void; }; + /** + * Set or get the power preference. + * + * Setting this property only has effect before the first WebGPU inference session is created. The value will be + * used as options for `navigator.gpu.requestAdapter()`. + * + * See {@link https://gpuweb.github.io/gpuweb/#dictdef-gpurequestadapteroptions} for more details. + * + * @defaultValue `undefined` + */ + powerPreference?: 'low-power'|'high-performance'; + /** + * Set or get the force fallback adapter flag. + * + * Setting this property only has effect before the first WebGPU inference session is created. The value will be + * used as options for `navigator.gpu.requestAdapter()`. + * + * See {@link https://gpuweb.github.io/gpuweb/#dictdef-gpurequestadapteroptions} for more details. + * + * @defaultValue `undefined` + */ + forceFallbackAdapter?: boolean; + /** + * Set or get the adapter for WebGPU. + * + * Setting this property only has effect before the first WebGPU inference session is created. The value will be + * used as the GPU adapter for the underlying WebGPU backend to create GPU device. + * + * If this property is not set, it will be available after the first WebGPU inference session is created. The + * value will be the GPU adapter created by the underlying WebGPU backend. + * + * When used with TypeScript, the type of this property is `GPUAdapter` defined in "@webgpu/types". + * Use `const adapter = env.webgpu.adapter as GPUAdapter;` in TypeScript to access this property with correct type. + * + * see comments on {@link Tensor.GpuBufferType} + */ + adapter: unknown; /** * Get the device for WebGPU. * + * This property is only available after the first WebGPU inference session is created. + * * When used with TypeScript, the type of this property is `GPUDevice` defined in "@webgpu/types". * Use `const device = env.webgpu.device as GPUDevice;` in TypeScript to access this property with correct type. * - * see comments on {@link GpuBufferType} for more details about why not use types defined in "@webgpu/types". + * see comments on {@link Tensor.GpuBufferType} for more details about why not use types defined in "@webgpu/types".
*/ readonly device: unknown; /** @@ -159,6 +207,7 @@ export interface Env { * @defaultValue `'warning'` */ logLevel?: 'verbose'|'info'|'warning'|'error'|'fatal'; + /** * Indicate whether run in debug mode. * @@ -166,6 +215,13 @@ export interface Env { */ debug?: boolean; + /** + * set or get a boolean value indicating whether to enable trace. + * + * @defaultValue `false` + */ + trace?: boolean; + /** * Get version of the current package. */ diff --git a/js/common/lib/index.ts b/js/common/lib/index.ts index 9cbfcc4e8bcd..3ed56b3c2e81 100644 --- a/js/common/lib/index.ts +++ b/js/common/lib/index.ts @@ -11,7 +11,7 @@ * - [onnxruntime-react-native](https://www.npmjs.com/package/onnxruntime-react-native) * * See also: - * - [Get Started](https://onnxruntime.ai/docs/get-started/with-javascript.html) + * - [Get Started](https://onnxruntime.ai/docs/get-started/with-javascript/) * - [Inference examples](https://github.com/microsoft/onnxruntime-inference-examples/tree/main/js) * * @packageDocumentation @@ -21,5 +21,9 @@ export * from './backend.js'; export * from './env.js'; export * from './inference-session.js'; export * from './tensor.js'; +export * from './tensor-conversion.js'; +export * from './tensor-factory.js'; +export * from './trace.js'; +export * from './onnx-model.js'; export * from './onnx-value.js'; export * from './training-session.js'; diff --git a/js/common/lib/inference-session-impl.ts b/js/common/lib/inference-session-impl.ts index 9bc2088f2088..ab4c6a3e0c46 100644 --- a/js/common/lib/inference-session-impl.ts +++ b/js/common/lib/inference-session-impl.ts @@ -1,11 +1,12 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -import {resolveBackend} from './backend-impl.js'; +import {resolveBackendAndExecutionProviders} from './backend-impl.js'; import {InferenceSessionHandler} from './backend.js'; import {InferenceSession as InferenceSessionInterface} from './inference-session.js'; import {OnnxValue} from './onnx-value.js'; import {Tensor} from './tensor.js'; +import {TRACE_FUNC_BEGIN, TRACE_FUNC_END} from './trace.js'; type SessionOptions = InferenceSessionInterface.SessionOptions; type RunOptions = InferenceSessionInterface.RunOptions; @@ -20,6 +21,7 @@ export class InferenceSession implements InferenceSessionInterface { run(feeds: FeedsType, options?: RunOptions): Promise; run(feeds: FeedsType, fetches: FetchesType, options?: RunOptions): Promise; async run(feeds: FeedsType, arg1?: FetchesType|RunOptions, arg2?: RunOptions): Promise { + TRACE_FUNC_BEGIN(); const fetches: {[name: string]: OnnxValue|null} = {}; let options: RunOptions = {}; // check inputs @@ -117,6 +119,7 @@ export class InferenceSession implements InferenceSessionInterface { } } } + TRACE_FUNC_END(); return returnValue; } @@ -132,6 +135,7 @@ export class InferenceSession implements InferenceSessionInterface { static async create( arg0: string|ArrayBufferLike|Uint8Array, arg1?: SessionOptions|number, arg2?: number, arg3?: SessionOptions): Promise { + TRACE_FUNC_BEGIN(); // either load from a file or buffer let filePathOrUint8Array: string|Uint8Array; let options: SessionOptions = {}; @@ -191,11 +195,10 @@ export class InferenceSession implements InferenceSessionInterface { throw new TypeError('Unexpected argument[0]: must be \'path\' or \'buffer\'.'); } - // get backend hints - const eps = options.executionProviders || []; - const backendHints = eps.map(i => typeof i === 'string' ? 
i : i.name); - const backend = await resolveBackend(backendHints); - const handler = await backend.createInferenceSessionHandler(filePathOrUint8Array, options); + // resolve backend, update session options with validated EPs, and create session handler + const [backend, optionsWithValidatedEPs] = await resolveBackendAndExecutionProviders(options); + const handler = await backend.createInferenceSessionHandler(filePathOrUint8Array, optionsWithValidatedEPs); + TRACE_FUNC_END(); return new InferenceSession(handler); } diff --git a/js/common/lib/inference-session.ts b/js/common/lib/inference-session.ts index c7760692eed0..14db5c59d972 100644 --- a/js/common/lib/inference-session.ts +++ b/js/common/lib/inference-session.ts @@ -2,6 +2,7 @@ // Licensed under the MIT License. import {InferenceSession as InferenceSessionImpl} from './inference-session-impl.js'; +import {OnnxModelOptions} from './onnx-model.js'; import {OnnxValue, OnnxValueDataLocation} from './onnx-value.js'; /* eslint-disable @typescript-eslint/no-redeclare */ @@ -43,7 +44,7 @@ export declare namespace InferenceSession { /** * A set of configurations for session behavior. */ - export interface SessionOptions { + export interface SessionOptions extends OnnxModelOptions { /** * An array of execution provider options. * @@ -110,7 +111,7 @@ export declare namespace InferenceSession { optimizedModelFilePath?: string; /** - * Wether enable profiling. + * Whether enable profiling. * * This setting is a placeholder for a future use. */ @@ -153,6 +154,12 @@ export declare namespace InferenceSession { */ preferredOutputLocation?: OnnxValueDataLocation|{readonly [outputName: string]: OnnxValueDataLocation}; + /** + * Whether enable graph capture. + * This setting is available only in ONNXRuntime Web for WebGPU EP. + */ + enableGraphCapture?: boolean; + /** * Store configurations for a session. See * https://github.com/microsoft/onnxruntime/blob/main/include/onnxruntime/core/session/ @@ -179,22 +186,22 @@ export declare namespace InferenceSession { // #region execution providers // Currently, we have the following backends to support execution providers: - // Backend Node.js binding: supports 'cpu' and 'cuda'. - // Backend WebAssembly: supports 'cpu', 'wasm', 'xnnpack' and 'webnn'. + // Backend Node.js binding: supports 'cpu', 'dml' (win32), 'coreml' (macOS) and 'cuda' (linux). + // Backend WebAssembly: supports 'cpu', 'wasm', 'webgpu' and 'webnn'. // Backend ONNX.js: supports 'webgl'. // Backend React Native: supports 'cpu', 'xnnpack', 'coreml' (iOS), 'nnapi' (Android). 
interface ExecutionProviderOptionMap { + coreml: CoreMLExecutionProviderOption; cpu: CpuExecutionProviderOption; - coreml: CoreMlExecutionProviderOption; cuda: CudaExecutionProviderOption; dml: DmlExecutionProviderOption; + nnapi: NnapiExecutionProviderOption; tensorrt: TensorRtExecutionProviderOption; wasm: WebAssemblyExecutionProviderOption; webgl: WebGLExecutionProviderOption; - xnnpack: XnnpackExecutionProviderOption; webgpu: WebGpuExecutionProviderOption; webnn: WebNNExecutionProviderOption; - nnapi: NnapiExecutionProviderOption; + xnnpack: XnnpackExecutionProviderOption; } type ExecutionProviderName = keyof ExecutionProviderOptionMap; @@ -212,10 +219,6 @@ export declare namespace InferenceSession { readonly name: 'cuda'; deviceId?: number; } - export interface CoreMlExecutionProviderOption extends ExecutionProviderOption { - readonly name: 'coreml'; - coreMlFlags?: number; - } export interface DmlExecutionProviderOption extends ExecutionProviderOption { readonly name: 'dml'; deviceId?: number; @@ -240,14 +243,45 @@ export declare namespace InferenceSession { } export interface WebNNExecutionProviderOption extends ExecutionProviderOption { readonly name: 'webnn'; - deviceType?: 'cpu'|'gpu'; + deviceType?: 'cpu'|'gpu'|'npu'; numThreads?: number; powerPreference?: 'default'|'low-power'|'high-performance'; } export interface CoreMLExecutionProviderOption extends ExecutionProviderOption { readonly name: 'coreml'; + /** + * The bit flags for CoreML execution provider. + * + * ``` + * COREML_FLAG_USE_CPU_ONLY = 0x001 + * COREML_FLAG_ENABLE_ON_SUBGRAPH = 0x002 + * COREML_FLAG_ONLY_ENABLE_DEVICE_WITH_ANE = 0x004 + * COREML_FLAG_ONLY_ALLOW_STATIC_INPUT_SHAPES = 0x008 + * COREML_FLAG_CREATE_MLPROGRAM = 0x010 + * ``` + * + * See include/onnxruntime/core/providers/coreml/coreml_provider_factory.h for more details. + * + * This flag is available only in ONNXRuntime (Node.js binding). + */ + coreMlFlags?: number; + /** + * Specify whether to use CPU only in CoreML EP. + * + * This setting is available only in ONNXRuntime (react-native). + */ useCPUOnly?: boolean; + /** + * Specify whether to enable CoreML EP on subgraph. + * + * This setting is available only in ONNXRuntime (react-native). + */ enableOnSubgraph?: boolean; + /** + * Specify whether to only enable CoreML EP for Apple devices with ANE (Apple Neural Engine). + * + * This setting is available only in ONNXRuntime (react-native). + */ onlyEnableDeviceWithANE?: boolean; } export interface NnapiExecutionProviderOption extends ExecutionProviderOption {
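Combined with the resolveBackendAndExecutionProviders change earlier in this diff, the executionProviders list now behaves as a graceful fallback chain: an EP whose backend fails to initialize is dropped with a console warning instead of aborting session creation. A sketch in onnxruntime-web terms (the model path is illustrative):

    import { InferenceSession } from 'onnxruntime-web';

    // 'webgpu' is tried first; if that backend cannot initialize, the session
    // falls back to 'wasm', and a warning is logged for the removed EP.
    const session = await InferenceSession.create('model.onnx', {
      executionProviders: ['webgpu', 'wasm'],
    });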
diff --git a/js/common/lib/onnx-model.ts b/js/common/lib/onnx-model.ts new file mode 100644 index 000000000000..1cd3eedb6fcc --- /dev/null +++ b/js/common/lib/onnx-model.ts @@ -0,0 +1,57 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +/** + * A string that represents a file's URL or path. + * + * Path is available only in onnxruntime-node or onnxruntime-web running in Node.js. + */ +export type FileUrlOrPath = string; + +/** + * A Blob object that represents a file. + */ +export type FileBlob = Blob; + +/** + * A Uint8Array, ArrayBuffer or SharedArrayBuffer object that represents a file content. + * + * When it is an ArrayBuffer or SharedArrayBuffer, the whole buffer is assumed to be the file content. + */ +export type FileData = Uint8Array|ArrayBufferLike; + +/** + * Represents a file that can be loaded by the ONNX Runtime JavaScript API. + */ +export type FileType = FileUrlOrPath|FileBlob|FileData; + +/** + * Represents an external data file. + */ +export interface ExternalDataFileDescription { + /** + * Specify the external data file. + */ + data: FileType; + /** + * Specify the file path. + */ + path: string; +} + +/** + * Represents an external data file. + * + * When using a string, it should be a file URL or path that is in the same directory as the model file. + */ +export type ExternalDataFileType = ExternalDataFileDescription|FileUrlOrPath; + +/** + * Options for model loading. + */ +export interface OnnxModelOptions { + /** + * A list of files that represent the external data. + */ + externalData?: readonly ExternalDataFileType[]; +} diff --git a/js/common/lib/onnx-value.ts b/js/common/lib/onnx-value.ts index a16a30d25d83..72369ce8b420 100644 --- a/js/common/lib/onnx-value.ts +++ b/js/common/lib/onnx-value.ts @@ -3,7 +3,7 @@ import {Tensor} from './tensor.js'; -type NonTensorType = never; +export type NonTensorType = never; /** * Type OnnxValue Represents both tensors and non-tensors value for model's inputs/outputs. diff --git a/js/common/lib/tensor-conversion-impl.ts b/js/common/lib/tensor-conversion-impl.ts index 22397321e8c6..b1de48a10c0e 100644 --- a/js/common/lib/tensor-conversion-impl.ts +++ b/js/common/lib/tensor-conversion-impl.ts @@ -8,10 +8,11 @@ import {Tensor} from './tensor.js'; * implementation of Tensor.toDataURL() */ export const tensorToDataURL = (tensor: Tensor, options?: TensorToDataUrlOptions): string => { - const canvas = document.createElement('canvas'); + const canvas = typeof document !== 'undefined' ? document.createElement('canvas') : (new OffscreenCanvas(1, 1)); canvas.width = tensor.dims[3]; canvas.height = tensor.dims[2]; - const pixels2DContext = canvas.getContext('2d'); + const pixels2DContext = + canvas.getContext('2d') as (CanvasRenderingContext2D | OffscreenCanvasRenderingContext2D | null); if (pixels2DContext != null) { // Default values for height and width & format @@ -88,7 +89,11 @@ export const tensorToDataURL = (tensor: Tensor, options?: TensorToDataUrlOptions pixels2DContext.fillRect(j, i, 1, 1); } } - return canvas.toDataURL(); + if ('toDataURL' in canvas) { + return canvas.toDataURL(); + } else { + throw new Error('toDataURL is not supported'); + } } else { throw new Error('Can not access image data'); } @@ -98,7 +103,9 @@ export const tensorToDataURL = (tensor: Tensor, options?: TensorToDataUrlOptions * implementation of Tensor.toImageData() */ export const tensorToImageData = (tensor: Tensor, options?: TensorToImageDataOptions): ImageData => { - const pixels2DContext = document.createElement('canvas').getContext('2d'); + const pixels2DContext = typeof document !== 'undefined' ? + document.createElement('canvas').getContext('2d') : + new OffscreenCanvas(1, 1).getContext('2d') as OffscreenCanvasRenderingContext2D; let image: ImageData; if (pixels2DContext != null) { // Default values for height and width & format diff --git a/js/common/lib/tensor-factory-impl.ts b/js/common/lib/tensor-factory-impl.ts index 7228c4a97055..19c62cb54bfe 100644 --- a/js/common/lib/tensor-factory-impl.ts +++ b/js/common/lib/tensor-factory-impl.ts @@ -110,13 +110,31 @@ export const tensorFromImage = async( let data: Uint8ClampedArray|undefined; let bufferToTensorOptions: BufferToTensorOptions = options ??
{}; + const createCanvas = () => { + if (typeof document !== 'undefined') { + return document.createElement('canvas'); + } else if (typeof OffscreenCanvas !== 'undefined') { + return new OffscreenCanvas(1, 1); + } else { + throw new Error('Canvas is not supported'); + } + }; + const createCanvasContext = (canvas: HTMLCanvasElement|OffscreenCanvas) => { + if (canvas instanceof HTMLCanvasElement) { + return canvas.getContext('2d'); + } else if (canvas instanceof OffscreenCanvas) { + return canvas.getContext('2d') as OffscreenCanvasRenderingContext2D; + } else { + return null; + } + }; // filling and checking image configuration options if (isHTMLImageEle) { // HTMLImageElement - image object - format is RGBA by default - const canvas = document.createElement('canvas'); + const canvas = createCanvas(); canvas.width = image.width; canvas.height = image.height; - const pixels2DContext = canvas.getContext('2d'); + const pixels2DContext = createCanvasContext(canvas); if (pixels2DContext != null) { let height = image.height; @@ -166,12 +184,12 @@ export const tensorFromImage = async( bufferToTensorOptions.width = width; if (options !== undefined) { - const tempCanvas = document.createElement('canvas'); + const tempCanvas = createCanvas(); tempCanvas.width = width; tempCanvas.height = height; - const pixels2DContext = tempCanvas.getContext('2d'); + const pixels2DContext = createCanvasContext(tempCanvas); if (pixels2DContext != null) { pixels2DContext.putImageData(image, 0, 0); @@ -188,10 +206,10 @@ export const tensorFromImage = async( throw new Error('Please provide image config with format for Imagebitmap'); } - const canvas = document.createElement('canvas'); + const canvas = createCanvas(); canvas.width = image.width; canvas.height = image.height; - const pixels2DContext = canvas.getContext('2d'); + const pixels2DContext = createCanvasContext(canvas); if (pixels2DContext != null) { const height = image.height; @@ -206,8 +224,8 @@ export const tensorFromImage = async( } } else if (isString) { return new Promise((resolve, reject) => { - const canvas = document.createElement('canvas'); - const context = canvas.getContext('2d'); + const canvas = createCanvas(); + const context = createCanvasContext(canvas); if (!image || !context) { return reject(); } diff --git a/js/common/lib/tensor-factory.ts b/js/common/lib/tensor-factory.ts index 6e19d7fb898a..431de4c3635c 100644 --- a/js/common/lib/tensor-factory.ts +++ b/js/common/lib/tensor-factory.ts @@ -253,7 +253,7 @@ export interface TensorFactory { /** * create a tensor from an ImageBitmap object * - * @param bitMap - the ImageBitmap object to create tensor from + * @param bitmap - the ImageBitmap object to create tensor from * @param options - An optional object representing options for creating tensor from URL. * * The following default settings will be applied: diff --git a/js/common/lib/tensor-impl-type-mapping.ts b/js/common/lib/tensor-impl-type-mapping.ts index c4a43ea27fea..b29cb8cbd6d3 100644 --- a/js/common/lib/tensor-impl-type-mapping.ts +++ b/js/common/lib/tensor-impl-type-mapping.ts @@ -14,7 +14,6 @@ export const NUMERIC_TENSOR_TYPE_TO_TYPEDARRAY_MAP = new Map { - if (!isBigIntChecked) { - isBigIntChecked = true; - const isBigInt64ArrayAvailable = typeof BigInt64Array !== 'undefined' && typeof BigInt64Array.from === 'function'; - const isBigUint64ArrayAvailable = - typeof BigUint64Array !== 'undefined' && typeof BigUint64Array.from === 'function'; +// a dummy type declaration for Float16Array in case any polyfill is available. 
+declare global { + // eslint-disable-next-line @typescript-eslint/naming-convention, @typescript-eslint/no-explicit-any + const Float16Array: any; +} + +// the following code allows delaying execution of BigInt/Float16Array checking. This allows lazy initialization for +// NUMERIC_TENSOR_TYPE_TO_TYPEDARRAY_MAP and NUMERIC_TENSOR_TYPEDARRAY_TO_TYPE_MAP, which allows BigInt/Float16Array +// polyfill if available. +let isTypedArrayChecked = false; +export const checkTypedArray = () => { + if (!isTypedArrayChecked) { + isTypedArrayChecked = true; + const isBigInt64ArrayAvailable = typeof BigInt64Array !== 'undefined' && BigInt64Array.from; + const isBigUint64ArrayAvailable = typeof BigUint64Array !== 'undefined' && BigUint64Array.from; + const isFloat16ArrayAvailable = typeof Float16Array !== 'undefined' && Float16Array.from; if (isBigInt64ArrayAvailable) { NUMERIC_TENSOR_TYPE_TO_TYPEDARRAY_MAP.set('int64', BigInt64Array); @@ -53,5 +58,12 @@ export const checkBigInt = () => { NUMERIC_TENSOR_TYPE_TO_TYPEDARRAY_MAP.set('uint64', BigUint64Array); NUMERIC_TENSOR_TYPEDARRAY_TO_TYPE_MAP.set(BigUint64Array, 'uint64'); } + if (isFloat16ArrayAvailable) { + NUMERIC_TENSOR_TYPE_TO_TYPEDARRAY_MAP.set('float16', Float16Array); + NUMERIC_TENSOR_TYPEDARRAY_TO_TYPE_MAP.set(Float16Array, 'float16'); + } else { + // if Float16Array is not available, use 'Uint16Array' to store the data. + NUMERIC_TENSOR_TYPE_TO_TYPEDARRAY_MAP.set('float16', Uint16Array); + } } }; diff --git a/js/common/lib/tensor-impl.ts b/js/common/lib/tensor-impl.ts index e3e2b9c72855..56682ef98e11 100644 --- a/js/common/lib/tensor-impl.ts +++ b/js/common/lib/tensor-impl.ts @@ -5,7 +5,7 @@ import {tensorToDataURL, tensorToImageData} from './tensor-conversion-impl.js'; import {TensorToDataUrlOptions, TensorToImageDataOptions} from './tensor-conversion.js'; import {tensorFromGpuBuffer, tensorFromImage, tensorFromPinnedBuffer, tensorFromTexture} from './tensor-factory-impl.js'; import {CpuPinnedConstructorParameters, GpuBufferConstructorParameters, TensorFromGpuBufferOptions, TensorFromImageBitmapOptions, TensorFromImageDataOptions, TensorFromImageElementOptions, TensorFromTextureOptions, TensorFromUrlOptions, TextureConstructorParameters} from './tensor-factory.js'; -import {checkBigInt, NUMERIC_TENSOR_TYPE_TO_TYPEDARRAY_MAP, NUMERIC_TENSOR_TYPEDARRAY_TO_TYPE_MAP, SupportedTypedArray, SupportedTypedArrayConstructors} from './tensor-impl-type-mapping.js'; +import {checkTypedArray, NUMERIC_TENSOR_TYPE_TO_TYPEDARRAY_MAP, NUMERIC_TENSOR_TYPEDARRAY_TO_TYPE_MAP, SupportedTypedArray, SupportedTypedArrayConstructors} from './tensor-impl-type-mapping.js'; import {calculateSize, tensorReshape} from './tensor-utils-impl.js'; import {Tensor as TensorInterface} from './tensor.js'; @@ -67,8 +67,8 @@ export class Tensor implements TensorInterface { arg0: TensorType|TensorDataType|readonly string[]|readonly boolean[]|CpuPinnedConstructorParameters| TextureConstructorParameters|GpuBufferConstructorParameters, arg1?: TensorDataType|readonly number[]|readonly string[]|readonly boolean[], arg2?: readonly number[]) { - // perform one-time check for BigInt support - checkBigInt(); + // perform one-time check for BigInt/Float16Array support + checkTypedArray(); let type: TensorType; let dims: readonly number[]; @@ -103,7 +103,7 @@ export class Tensor implements TensorInterface { } case 'gpu-buffer': { if ((type !== 'float32' && type !== 'float16' && type !== 'int32' && type !== 'int64' && type !== 'uint32' && - type !== 'bool')) { + type !== 'uint8' && type !== 
'bool')) { throw new TypeError(`unsupported type "${type}" to create tensor from gpu buffer`); } this.gpuBufferData = arg0.gpuBuffer; @@ -142,7 +142,9 @@ export class Tensor implements TensorInterface { throw new TypeError(`Unsupported tensor type: ${arg0}.`); } if (Array.isArray(arg1)) { - if (arg0 === 'float16') { + if (arg0 === 'float16' && typedArrayConstructor === Uint16Array) { + // When no Float16Array polyfill is used, we cannot create 'float16' tensor from number array. + // // Throw error here because when user try to use number array as data, // e.g. new Tensor('float16', [1, 2, 3, 4], dims)), it will actually call // Uint16Array.from(arg1) which generates wrong data. diff --git a/js/common/lib/tensor.ts b/js/common/lib/tensor.ts index 6c08d1fe8e05..20319ebb800c 100644 --- a/js/common/lib/tensor.ts +++ b/js/common/lib/tensor.ts @@ -135,7 +135,7 @@ export declare namespace Tensor { /** * supported data types for constructing a tensor from a WebGPU buffer */ - export type GpuBufferDataTypes = 'float32'|'float16'|'int32'|'int64'|'uint32'|'bool'; + export type GpuBufferDataTypes = 'float32'|'float16'|'int32'|'int64'|'uint32'|'uint8'|'bool'; /** * represent where the tensor data is stored @@ -160,7 +160,7 @@ export interface Tensor extends TypedTensorBase, TypedTensorUtils { + if (typeof env.trace === 'undefined' ? !env.wasm.trace : !env.trace) { + return; + } + // eslint-disable-next-line no-console + console.timeStamp(`${deviceType}::ORT::${label}`); +}; + +const TRACE_FUNC = (msg: string, extraMsg?: string) => { + const stack = new Error().stack?.split(/\r\n|\r|\n/g) || []; + let hasTraceFunc = false; + for (let i = 0; i < stack.length; i++) { + if (hasTraceFunc && !stack[i].includes('TRACE_FUNC')) { + let label = `FUNC_${msg}::${stack[i].trim().split(' ')[1]}`; + if (extraMsg) { + label += `::${extraMsg}`; + } + TRACE('CPU', label); + return; + } + if (stack[i].includes('TRACE_FUNC')) { + hasTraceFunc = true; + } + } +}; + +/** + * @ignore + */ +export const TRACE_FUNC_BEGIN = (extraMsg?: string) => { + if (typeof env.trace === 'undefined' ? !env.wasm.trace : !env.trace) { + return; + } + TRACE_FUNC('BEGIN', extraMsg); +}; + +/** + * @ignore + */ +export const TRACE_FUNC_END = (extraMsg?: string) => { + if (typeof env.trace === 'undefined' ? !env.wasm.trace : !env.trace) { + return; + } + TRACE_FUNC('END', extraMsg); +}; diff --git a/js/common/lib/training-session-impl.ts b/js/common/lib/training-session-impl.ts index 23bd4421ae67..bae38b0dfda5 100644 --- a/js/common/lib/training-session-impl.ts +++ b/js/common/lib/training-session-impl.ts @@ -1,7 +1,7 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -import {resolveBackend} from './backend-impl.js'; +import {resolveBackendAndExecutionProviders} from './backend-impl.js'; import {SessionHandler, TrainingSessionHandler} from './backend.js'; import {InferenceSession as InferenceSession} from './inference-session.js'; import {OnnxValue} from './onnx-value.js'; @@ -55,13 +55,12 @@ export class TrainingSession implements TrainingSessionInterface { const optimizerModel: string|Uint8Array = trainingOptions.optimizerModel || ''; const options: SessionOptions = sessionOptions || {}; - // get backend hints - const eps = options.executionProviders || []; - const backendHints = eps.map(i => typeof i === 'string' ? 
i : i.name); - const backend = await resolveBackend(backendHints); + // resolve backend, update session options with validated EPs, and create session handler + const [backend, optionsWithValidatedEPs] = await resolveBackendAndExecutionProviders(options); if (backend.createTrainingSessionHandler) { const handler = await backend.createTrainingSessionHandler( - trainingOptions.checkpointState, trainingOptions.trainModel, evalModel, optimizerModel, options); + trainingOptions.checkpointState, trainingOptions.trainModel, evalModel, optimizerModel, + optionsWithValidatedEPs); return new TrainingSession(handler, !!trainingOptions.optimizerModel, !!trainingOptions.evalModel); } else { throw new Error(noBackendErrMsg); diff --git a/js/common/lib/training-session.ts b/js/common/lib/training-session.ts index e54aed90e702..f9de77e3ac7d 100644 --- a/js/common/lib/training-session.ts +++ b/js/common/lib/training-session.ts @@ -11,7 +11,7 @@ export declare namespace TrainingSession { /** * Either URI file path (string) or Uint8Array containing model or checkpoint information. */ - type URIorBuffer = string|Uint8Array; + type UriOrBuffer = string|Uint8Array; } /** @@ -98,13 +98,13 @@ export interface TrainingSession { getParametersSize(trainableOnly: boolean): Promise<number>; /** - * Copies parameter values from the given array to the training state. Currently, only supporting models with + * Copies parameter values from the given buffer to the training state. Currently supports only models with * parameters of type Float32. * - * @param buffer - Float32 buffer containing parameters converted to a Uint8Array. + * @param buffer - A Uint8Array representation of Float32 parameters. * @param trainableOnly - True if trainable parameters only to be modified, false otherwise. Default value is true. */ - loadParametersBuffer(array: Uint8Array, trainableOnly: boolean): Promise<void>; + loadParametersBuffer(buffer: Uint8Array, trainableOnly: boolean): Promise<void>; /** * Copies the model parameters to a contiguous buffer. Usually used in the context of Federated Learning. @@ -157,19 +157,19 @@ export interface TrainingSessionCreateOptions { /** * URI or buffer for a .ckpt file that contains the checkpoint for the training model. */ - checkpointState: TrainingSession.URIorBuffer; + checkpointState: TrainingSession.UriOrBuffer; /** * URI or buffer for the .onnx training file. */ - trainModel: TrainingSession.URIorBuffer; + trainModel: TrainingSession.UriOrBuffer; /** * Optional. URI or buffer for the .onnx optimizer model file. */ - optimizerModel?: TrainingSession.URIorBuffer; + optimizerModel?: TrainingSession.UriOrBuffer; /** * Optional. URI or buffer for the .onnx eval model file. */ - evalModel?: TrainingSession.URIorBuffer; + evalModel?: TrainingSession.UriOrBuffer; } /** diff --git a/js/common/lib/version.ts b/js/common/lib/version.ts index 96c2361cceab..40f970ddf02a 100644 --- a/js/common/lib/version.ts +++ b/js/common/lib/version.ts @@ -4,4 +4,4 @@ // This file is generated by /js/scripts/update-version.ts // Do not modify file content manually.
-export const version = '1.17.0'; +export const version = '1.18.0'; diff --git a/js/common/package-lock.json b/js/common/package-lock.json index 84f6dba83fa5..3988ac80707e 100644 --- a/js/common/package-lock.json +++ b/js/common/package-lock.json @@ -1,21 +1,21 @@ { "name": "onnxruntime-common", - "version": "1.17.0", + "version": "1.18.0", "lockfileVersion": 2, "requires": true, "packages": { "": { "name": "onnxruntime-common", - "version": "1.17.0", + "version": "1.18.0", "license": "MIT", "devDependencies": { - "typedoc": "^0.23.22" + "typedoc": "^0.25.7" } }, "node_modules/ansi-sequence-parser": { - "version": "1.1.0", - "resolved": "https://registry.npmjs.org/ansi-sequence-parser/-/ansi-sequence-parser-1.1.0.tgz", - "integrity": "sha512-lEm8mt52to2fT8GhciPCGeCXACSz2UwIN4X2e2LJSnZ5uAbn2/dsYdOmUXq0AtWS5cpAupysIneExOgH0Vd2TQ==", + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/ansi-sequence-parser/-/ansi-sequence-parser-1.1.1.tgz", + "integrity": "sha512-vJXt3yiaUL4UU546s3rPXlsry/RnM730G1+HkpKE012AN0sx1eOrxSu95oKDIonskeLTijMgqWZ3uDEe3NFvyg==", "dev": true }, "node_modules/balanced-match": { @@ -34,9 +34,9 @@ } }, "node_modules/jsonc-parser": { - "version": "3.2.0", - "resolved": "https://registry.npmjs.org/jsonc-parser/-/jsonc-parser-3.2.0.tgz", - "integrity": "sha512-gfFQZrcTc8CnKXp6Y4/CBT3fTc0OVuDofpre4aEeEpSBPV5X5v4+Vmx+8snU7RLPrNHPKSgLxGo9YuQzz20o+w==", + "version": "3.2.1", + "resolved": "https://registry.npmjs.org/jsonc-parser/-/jsonc-parser-3.2.1.tgz", + "integrity": "sha512-AilxAyFOAcK5wA1+LeaySVBrHsGQvUFCDWXKpZjzaL0PqW+xfBOttn8GNtWKFWqneyMZj41MWF9Kl6iPWLwgOA==", "dev": true }, "node_modules/lunr": { @@ -46,9 +46,9 @@ "dev": true }, "node_modules/marked": { - "version": "4.2.12", - "resolved": "https://registry.npmjs.org/marked/-/marked-4.2.12.tgz", - "integrity": "sha512-yr8hSKa3Fv4D3jdZmtMMPghgVt6TWbk86WQaWhDloQjRSQhMMYCAro7jP7VDJrjjdV8pxVxMssXS8B8Y5DZ5aw==", + "version": "4.3.0", + "resolved": "https://registry.npmjs.org/marked/-/marked-4.3.0.tgz", + "integrity": "sha512-PRsaiG84bK+AMvxziE/lCFss8juXjNaWzVbN5tXAm4XjeaS9NAHhop+PjQxz2A9h8Q4M/xGmzP8vqNwy6JeK0A==", "dev": true, "bin": { "marked": "bin/marked.js" @@ -58,24 +58,24 @@ } }, "node_modules/minimatch": { - "version": "7.4.2", - "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-7.4.2.tgz", - "integrity": "sha512-xy4q7wou3vUoC9k1xGTXc+awNdGaGVHtFUaey8tiX4H1QRc04DZ/rmDFwNm2EBsuYEhAZ6SgMmYf3InGY6OauA==", + "version": "9.0.3", + "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-9.0.3.tgz", + "integrity": "sha512-RHiac9mvaRw0x3AYRgDC1CxAP7HTcNrrECeA8YYJeWnpo+2Q5CegtZjaotWTWxDG3UeGA1coE05iH1mPjT/2mg==", "dev": true, "dependencies": { "brace-expansion": "^2.0.1" }, "engines": { - "node": ">=10" + "node": ">=16 || 14 >=14.17" }, "funding": { "url": "https://github.com/sponsors/isaacs" } }, "node_modules/shiki": { - "version": "0.14.1", - "resolved": "https://registry.npmjs.org/shiki/-/shiki-0.14.1.tgz", - "integrity": "sha512-+Jz4nBkCBe0mEDqo1eKRcCdjRtrCjozmcbTUjbPTX7OOJfEbTZzlUWlZtGe3Gb5oV1/jnojhG//YZc3rs9zSEw==", + "version": "0.14.7", + "resolved": "https://registry.npmjs.org/shiki/-/shiki-0.14.7.tgz", + "integrity": "sha512-dNPAPrxSc87ua2sKJ3H5dQ/6ZaY8RNnaAqK+t0eG7p0Soi2ydiqbGOTaZCqaYvA/uZYfS1LJnemt3Q+mSfcPCg==", "dev": true, "dependencies": { "ansi-sequence-parser": "^1.1.0", @@ -85,30 +85,30 @@ } }, "node_modules/typedoc": { - "version": "0.23.26", - "resolved": "https://registry.npmjs.org/typedoc/-/typedoc-0.23.26.tgz", - "integrity": 
"sha512-5m4KwR5tOLnk0OtMaRn9IdbeRM32uPemN9kur7YK9wFqx8U0CYrvO9aVq6ysdZSV1c824BTm+BuQl2Ze/k1HtA==", + "version": "0.25.7", + "resolved": "https://registry.npmjs.org/typedoc/-/typedoc-0.25.7.tgz", + "integrity": "sha512-m6A6JjQRg39p2ZVRIN3NKXgrN8vzlHhOS+r9ymUYtcUP/TIQPvWSq7YgE5ZjASfv5Vd5BW5xrir6Gm2XNNcOow==", "dev": true, "dependencies": { "lunr": "^2.3.9", - "marked": "^4.2.12", - "minimatch": "^7.1.3", - "shiki": "^0.14.1" + "marked": "^4.3.0", + "minimatch": "^9.0.3", + "shiki": "^0.14.7" }, "bin": { "typedoc": "bin/typedoc" }, "engines": { - "node": ">= 14.14" + "node": ">= 16" }, "peerDependencies": { - "typescript": "4.6.x || 4.7.x || 4.8.x || 4.9.x" + "typescript": "4.6.x || 4.7.x || 4.8.x || 4.9.x || 5.0.x || 5.1.x || 5.2.x || 5.3.x" } }, "node_modules/typescript": { - "version": "4.9.5", - "resolved": "https://registry.npmjs.org/typescript/-/typescript-4.9.5.tgz", - "integrity": "sha512-1FXk9E2Hm+QzZQ7z+McJiHL4NW1F2EzMu9Nq9i3zAaGqibafqYwCVU6WyWAuyQRRzOlxou8xZSyXLEN8oKj24g==", + "version": "5.2.2", + "resolved": "https://registry.npmjs.org/typescript/-/typescript-5.2.2.tgz", + "integrity": "sha512-mI4WrpHsbCIcwT9cF4FZvr80QUeKvsUsUvKDoR+X/7XHQH98xYD8YHZg7ANtz2GtZt/CBq2QJ0thkGJMHfqc1w==", "dev": true, "peer": true, "bin": { @@ -116,7 +116,7 @@ "tsserver": "bin/tsserver" }, "engines": { - "node": ">=4.2.0" + "node": ">=14.17" } }, "node_modules/vscode-oniguruma": { @@ -134,9 +134,9 @@ }, "dependencies": { "ansi-sequence-parser": { - "version": "1.1.0", - "resolved": "https://registry.npmjs.org/ansi-sequence-parser/-/ansi-sequence-parser-1.1.0.tgz", - "integrity": "sha512-lEm8mt52to2fT8GhciPCGeCXACSz2UwIN4X2e2LJSnZ5uAbn2/dsYdOmUXq0AtWS5cpAupysIneExOgH0Vd2TQ==", + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/ansi-sequence-parser/-/ansi-sequence-parser-1.1.1.tgz", + "integrity": "sha512-vJXt3yiaUL4UU546s3rPXlsry/RnM730G1+HkpKE012AN0sx1eOrxSu95oKDIonskeLTijMgqWZ3uDEe3NFvyg==", "dev": true }, "balanced-match": { @@ -155,9 +155,9 @@ } }, "jsonc-parser": { - "version": "3.2.0", - "resolved": "https://registry.npmjs.org/jsonc-parser/-/jsonc-parser-3.2.0.tgz", - "integrity": "sha512-gfFQZrcTc8CnKXp6Y4/CBT3fTc0OVuDofpre4aEeEpSBPV5X5v4+Vmx+8snU7RLPrNHPKSgLxGo9YuQzz20o+w==", + "version": "3.2.1", + "resolved": "https://registry.npmjs.org/jsonc-parser/-/jsonc-parser-3.2.1.tgz", + "integrity": "sha512-AilxAyFOAcK5wA1+LeaySVBrHsGQvUFCDWXKpZjzaL0PqW+xfBOttn8GNtWKFWqneyMZj41MWF9Kl6iPWLwgOA==", "dev": true }, "lunr": { @@ -167,24 +167,24 @@ "dev": true }, "marked": { - "version": "4.2.12", - "resolved": "https://registry.npmjs.org/marked/-/marked-4.2.12.tgz", - "integrity": "sha512-yr8hSKa3Fv4D3jdZmtMMPghgVt6TWbk86WQaWhDloQjRSQhMMYCAro7jP7VDJrjjdV8pxVxMssXS8B8Y5DZ5aw==", + "version": "4.3.0", + "resolved": "https://registry.npmjs.org/marked/-/marked-4.3.0.tgz", + "integrity": "sha512-PRsaiG84bK+AMvxziE/lCFss8juXjNaWzVbN5tXAm4XjeaS9NAHhop+PjQxz2A9h8Q4M/xGmzP8vqNwy6JeK0A==", "dev": true }, "minimatch": { - "version": "7.4.2", - "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-7.4.2.tgz", - "integrity": "sha512-xy4q7wou3vUoC9k1xGTXc+awNdGaGVHtFUaey8tiX4H1QRc04DZ/rmDFwNm2EBsuYEhAZ6SgMmYf3InGY6OauA==", + "version": "9.0.3", + "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-9.0.3.tgz", + "integrity": "sha512-RHiac9mvaRw0x3AYRgDC1CxAP7HTcNrrECeA8YYJeWnpo+2Q5CegtZjaotWTWxDG3UeGA1coE05iH1mPjT/2mg==", "dev": true, "requires": { "brace-expansion": "^2.0.1" } }, "shiki": { - "version": "0.14.1", - "resolved": "https://registry.npmjs.org/shiki/-/shiki-0.14.1.tgz", - 
"integrity": "sha512-+Jz4nBkCBe0mEDqo1eKRcCdjRtrCjozmcbTUjbPTX7OOJfEbTZzlUWlZtGe3Gb5oV1/jnojhG//YZc3rs9zSEw==", + "version": "0.14.7", + "resolved": "https://registry.npmjs.org/shiki/-/shiki-0.14.7.tgz", + "integrity": "sha512-dNPAPrxSc87ua2sKJ3H5dQ/6ZaY8RNnaAqK+t0eG7p0Soi2ydiqbGOTaZCqaYvA/uZYfS1LJnemt3Q+mSfcPCg==", "dev": true, "requires": { "ansi-sequence-parser": "^1.1.0", @@ -194,21 +194,21 @@ } }, "typedoc": { - "version": "0.23.26", - "resolved": "https://registry.npmjs.org/typedoc/-/typedoc-0.23.26.tgz", - "integrity": "sha512-5m4KwR5tOLnk0OtMaRn9IdbeRM32uPemN9kur7YK9wFqx8U0CYrvO9aVq6ysdZSV1c824BTm+BuQl2Ze/k1HtA==", + "version": "0.25.7", + "resolved": "https://registry.npmjs.org/typedoc/-/typedoc-0.25.7.tgz", + "integrity": "sha512-m6A6JjQRg39p2ZVRIN3NKXgrN8vzlHhOS+r9ymUYtcUP/TIQPvWSq7YgE5ZjASfv5Vd5BW5xrir6Gm2XNNcOow==", "dev": true, "requires": { "lunr": "^2.3.9", - "marked": "^4.2.12", - "minimatch": "^7.1.3", - "shiki": "^0.14.1" + "marked": "^4.3.0", + "minimatch": "^9.0.3", + "shiki": "^0.14.7" } }, "typescript": { - "version": "4.9.5", - "resolved": "https://registry.npmjs.org/typescript/-/typescript-4.9.5.tgz", - "integrity": "sha512-1FXk9E2Hm+QzZQ7z+McJiHL4NW1F2EzMu9Nq9i3zAaGqibafqYwCVU6WyWAuyQRRzOlxou8xZSyXLEN8oKj24g==", + "version": "5.2.2", + "resolved": "https://registry.npmjs.org/typescript/-/typescript-5.2.2.tgz", + "integrity": "sha512-mI4WrpHsbCIcwT9cF4FZvr80QUeKvsUsUvKDoR+X/7XHQH98xYD8YHZg7ANtz2GtZt/CBq2QJ0thkGJMHfqc1w==", "dev": true, "peer": true }, diff --git a/js/common/package.json b/js/common/package.json index beab7d29be26..cd2612aab498 100644 --- a/js/common/package.json +++ b/js/common/package.json @@ -2,14 +2,14 @@ "license": "MIT", "type": "module", "name": "onnxruntime-common", - "version": "1.17.0", + "version": "1.18.0", "repository": { "url": "https://github.com/Microsoft/onnxruntime.git", "type": "git" }, "author": "fs-eire", "scripts": { - "build:cjs": "tsc --module commonjs --outDir ./dist/cjs", + "build:cjs": "tsc --module commonjs --moduleResolution node10 --outDir ./dist/cjs", "build:esm": "tsc", "build:bundles": "webpack", "build": "node ./build.js", @@ -18,7 +18,7 @@ "test": "mocha ./test/**/*.js --timeout 30000" }, "devDependencies": { - "typedoc": "^0.23.22" + "typedoc": "^0.25.7" }, "main": "dist/cjs/index.js", "exports": { diff --git a/js/common/test/tsconfig.json b/js/common/test/tsconfig.json index 2e4927ac3b32..e9068ad837a8 100644 --- a/js/common/test/tsconfig.json +++ b/js/common/test/tsconfig.json @@ -2,7 +2,7 @@ "extends": "../../tsconfig.tools.json", "exclude": ["type-tests/**/*.ts"], "compilerOptions": { - "module": "ES2022", + "module": "Node16", "sourceMap": true } } diff --git a/js/node/CMakeLists.txt b/js/node/CMakeLists.txt index c3898fbad740..8157df288eeb 100644 --- a/js/node/CMakeLists.txt +++ b/js/node/CMakeLists.txt @@ -66,9 +66,17 @@ if(MSVC AND CMAKE_JS_NODELIB_DEF AND CMAKE_JS_NODELIB_TARGET) execute_process(COMMAND ${CMAKE_AR} /def:${CMAKE_JS_NODELIB_DEF} /out:${CMAKE_JS_NODELIB_TARGET} ${CMAKE_STATIC_LINKER_FLAGS}) endif() +if (WIN32) + if (${ONNXRUNTIME_GENERATOR} MATCHES "Ninja") + set(ONNXRUNTIME_WIN_BIN_DIR ${ONNXRUNTIME_BUILD_DIR}) + else() + set(ONNXRUNTIME_WIN_BIN_DIR ${ONNXRUNTIME_BUILD_DIR}/${CMAKE_BUILD_TYPE}) + endif() + message(STATUS "onnxruntime dist dir: ${ONNXRUNTIME_WIN_BIN_DIR}") +endif() # add libraries if (WIN32) - target_link_directories(onnxruntime_binding PRIVATE ${ONNXRUNTIME_BUILD_DIR}/${CMAKE_BUILD_TYPE}) + target_link_directories(onnxruntime_binding PRIVATE ${ONNXRUNTIME_WIN_BIN_DIR}) 
else() target_link_directories(onnxruntime_binding PRIVATE ${ONNXRUNTIME_BUILD_DIR}) endif() @@ -95,14 +103,14 @@ if (WIN32) add_custom_command( TARGET onnxruntime_binding POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy - ${ONNXRUNTIME_BUILD_DIR}/${CMAKE_BUILD_TYPE}/onnxruntime.dll + ${ONNXRUNTIME_WIN_BIN_DIR}/onnxruntime.dll ${dist_folder} ) if (USE_DML) add_custom_command( TARGET onnxruntime_binding POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy - ${ONNXRUNTIME_BUILD_DIR}/${CMAKE_BUILD_TYPE}/DirectML.dll + ${ONNXRUNTIME_WIN_BIN_DIR}/DirectML.dll ${dist_folder} ) endif () @@ -110,7 +118,7 @@ if (WIN32) add_custom_command( TARGET onnxruntime_binding POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy - ${ONNXRUNTIME_BUILD_DIR}/${CMAKE_BUILD_TYPE}/onnxruntime.pdb + ${ONNXRUNTIME_WIN_BIN_DIR}/onnxruntime.pdb ${dist_folder} COMMAND ${CMAKE_COMMAND} -E copy $<TARGET_FILE_DIR:onnxruntime_binding>/onnxruntime_binding.pdb ${dist_folder} ) diff --git a/js/node/README.md b/js/node/README.md index 98b2ea66de2a..234eaa111a22 100644 --- a/js/node/README.md +++ b/js/node/README.md @@ -22,7 +22,7 @@ Following platforms are supported with pre-built binaries: - Linux x64 CPU NAPI_v3 - MacOS x64 CPU NAPI_v3 -To use on platforms without pre-built binaries, you can build Node.js binding from source and consume it by `npm install <onnxruntime_repo_root>/js/node/`. See also [instructions](https://www.onnxruntime.ai/docs/how-to/build.html#apis-and-language-bindings) for building ONNX Runtime Node.js binding locally. +To use on platforms without pre-built binaries, you can build Node.js binding from source and consume it by `npm install <onnxruntime_repo_root>/js/node/`. See also [instructions](https://onnxruntime.ai/docs/build/inferencing.html#apis-and-language-bindings) for building ONNX Runtime Node.js binding locally. # GPU Support diff --git a/js/node/lib/backend.ts b/js/node/lib/backend.ts index e8eb0e9babf5..927953b4f1dd 100644 --- a/js/node/lib/backend.ts +++ b/js/node/lib/backend.ts @@ -36,7 +36,7 @@ class OnnxruntimeSessionHandler implements InferenceSessionHandler { async run(feeds: SessionHandler.FeedsType, fetches: SessionHandler.FetchesType, options: InferenceSession.RunOptions): Promise<SessionHandler.ReturnType> { return new Promise((resolve, reject) => { - process.nextTick(() => { + setImmediate(() => { try { resolve(this.#inferenceSession.run(feeds, fetches, options)); } catch (e) { @@ -56,7 +56,7 @@ class OnnxruntimeBackend implements Backend { async createInferenceSessionHandler(pathOrBuffer: string|Uint8Array, options?: InferenceSession.SessionOptions): Promise<InferenceSessionHandler> { return new Promise((resolve, reject) => { - process.nextTick(() => { + setImmediate(() => { try { resolve(new OnnxruntimeSessionHandler(pathOrBuffer, options || {})); } catch (e) { diff --git a/js/node/lib/version.ts b/js/node/lib/version.ts index 96c2361cceab..40f970ddf02a 100644 --- a/js/node/lib/version.ts +++ b/js/node/lib/version.ts @@ -4,4 +4,4 @@ // This file is generated by /js/scripts/update-version.ts // Do not modify file content manually.
-export const version = '1.17.0'; +export const version = '1.18.0'; diff --git a/js/node/package-lock.json b/js/node/package-lock.json index c1cf8af4bb80..62b47698a143 100644 --- a/js/node/package-lock.json +++ b/js/node/package-lock.json @@ -1,12 +1,12 @@ { "name": "onnxruntime-node", - "version": "1.17.0", + "version": "1.18.0", "lockfileVersion": 2, "requires": true, "packages": { "": { "name": "onnxruntime-node", - "version": "1.17.0", + "version": "1.18.0", "license": "MIT", "os": [ "win32", @@ -27,10 +27,10 @@ }, "../common": { "name": "onnxruntime-common", - "version": "1.17.0", + "version": "1.18.0", "license": "MIT", "devDependencies": { - "typedoc": "^0.23.22" + "typedoc": "^0.25.7" } }, "node_modules/@protobufjs/aspromise": { @@ -336,9 +336,9 @@ "dev": true }, "node_modules/follow-redirects": { - "version": "1.15.2", - "resolved": "https://registry.npmjs.org/follow-redirects/-/follow-redirects-1.15.2.tgz", - "integrity": "sha512-VQLG33o04KaQ8uYi2tVNbdrWp1QWxNNea+nmIB4EVM28v0hmP17z7aG1+wAkNzVq4KeXTq3221ye5qTJP91JwA==", + "version": "1.15.6", + "resolved": "https://registry.npmjs.org/follow-redirects/-/follow-redirects-1.15.6.tgz", + "integrity": "sha512-wWN62YITEaOpSK584EZXJafH1AGpO8RVgElfkuXbTOrPX4fIfOyEpW/CsiNd8JdYrAoOvafRTOEnvsO++qCqFA==", "dev": true, "funding": [ { @@ -1242,9 +1242,9 @@ "dev": true }, "follow-redirects": { - "version": "1.15.2", - "resolved": "https://registry.npmjs.org/follow-redirects/-/follow-redirects-1.15.2.tgz", - "integrity": "sha512-VQLG33o04KaQ8uYi2tVNbdrWp1QWxNNea+nmIB4EVM28v0hmP17z7aG1+wAkNzVq4KeXTq3221ye5qTJP91JwA==", + "version": "1.15.6", + "resolved": "https://registry.npmjs.org/follow-redirects/-/follow-redirects-1.15.6.tgz", + "integrity": "sha512-wWN62YITEaOpSK584EZXJafH1AGpO8RVgElfkuXbTOrPX4fIfOyEpW/CsiNd8JdYrAoOvafRTOEnvsO++qCqFA==", "dev": true }, "form-data": { @@ -1503,7 +1503,7 @@ "onnxruntime-common": { "version": "file:../common", "requires": { - "typedoc": "^0.23.22" + "typedoc": "^0.25.7" } }, "parse-json": { diff --git a/js/node/package.json b/js/node/package.json index 8e591d8f46b9..026840742e29 100644 --- a/js/node/package.json +++ b/js/node/package.json @@ -13,7 +13,7 @@ 3 ] }, - "version": "1.17.0", + "version": "1.18.0", "dependencies": { "onnxruntime-common": "file:../common" }, diff --git a/js/node/script/build.ts b/js/node/script/build.ts index dfa88821a8d0..cc5950717908 100644 --- a/js/node/script/build.ts +++ b/js/node/script/build.ts @@ -23,6 +23,8 @@ if (ARCH !== 'x64' && ARCH !== 'ia32' && ARCH !== 'arm64' && ARCH !== 'arm') { } // --onnxruntime-build-dir= const ONNXRUNTIME_BUILD_DIR = buildArgs['onnxruntime-build-dir']; +// --onnxruntime-generator= +const ONNXRUNTIME_GENERATOR = buildArgs['onnxruntime-generator']; // --rebuild const REBUILD = !!buildArgs.rebuild; // --use_dml @@ -55,6 +57,9 @@ const args = [ if (ONNXRUNTIME_BUILD_DIR && typeof ONNXRUNTIME_BUILD_DIR === 'string') { args.push(`--CDONNXRUNTIME_BUILD_DIR=${ONNXRUNTIME_BUILD_DIR}`); } +if (ONNXRUNTIME_GENERATOR && typeof ONNXRUNTIME_GENERATOR === 'string') { + args.push(`--CDONNXRUNTIME_GENERATOR=${ONNXRUNTIME_GENERATOR}`); +} if (USE_DML) { args.push('--CDUSE_DML=ON'); } diff --git a/js/react_native/android/src/main/java/ai/onnxruntime/reactnative/OnnxruntimeModule.java b/js/react_native/android/src/main/java/ai/onnxruntime/reactnative/OnnxruntimeModule.java index fd085f953380..707a356b949a 100644 --- a/js/react_native/android/src/main/java/ai/onnxruntime/reactnative/OnnxruntimeModule.java +++ 
b/js/react_native/android/src/main/java/ai/onnxruntime/reactnative/OnnxruntimeModule.java @@ -199,6 +199,12 @@ private WritableMap loadModelImpl(String uri, byte[] modelData, ReadableMap opti if (modelData != null && modelData.length > 0) { // load model via model data array ortSession = ortEnvironment.createSession(modelData, sessionOptions); + } else if (uri.startsWith("file://") || uri.startsWith("/")) { + // load model from local + if (uri.startsWith("file://")) { + uri = uri.substring(7); + } + ortSession = ortEnvironment.createSession(uri, sessionOptions); } else { // load model via model path string uri InputStream modelStream = diff --git a/js/react_native/e2e/yarn.lock b/js/react_native/e2e/yarn.lock index 9e20a286c4e2..6f05faf04609 100644 --- a/js/react_native/e2e/yarn.lock +++ b/js/react_native/e2e/yarn.lock @@ -3351,9 +3351,9 @@ invariant@^2.2.4: loose-envify "^1.0.0" ip@^1.1.5: - version "1.1.8" - resolved "https://registry.yarnpkg.com/ip/-/ip-1.1.8.tgz#ae05948f6b075435ed3307acce04629da8cdbf48" - integrity sha512-PuExPYUiu6qMBQb4l06ecm6T6ujzhmh+MeJcW9wa89PoAz5pvd4zPgN5WJV104mb6S2T1AwNIAaB70JNrLQWhg== + version "1.1.9" + resolved "https://registry.yarnpkg.com/ip/-/ip-1.1.9.tgz#8dfbcc99a754d07f425310b86a99546b1151e396" + integrity sha512-cyRxvOEpNHNtchU3Ln9KC/auJgup87llfQpQ+t5ghoC/UhL16SWzbueiCsdTnWmqAWl7LadfuwhlqmtOaqMHdQ== is-accessor-descriptor@^0.1.6: version "0.1.6" diff --git a/js/react_native/lib/version.ts b/js/react_native/lib/version.ts index 96c2361cceab..40f970ddf02a 100644 --- a/js/react_native/lib/version.ts +++ b/js/react_native/lib/version.ts @@ -4,4 +4,4 @@ // This file is generated by /js/scripts/update-version.ts // Do not modify file content manually. -export const version = '1.17.0'; +export const version = '1.18.0'; diff --git a/js/react_native/package.json b/js/react_native/package.json index 39e6cb08bb06..47324a76fe55 100644 --- a/js/react_native/package.json +++ b/js/react_native/package.json @@ -36,7 +36,7 @@ "registry": "https://registry.npmjs.org/" }, "source": "lib/index", - "version": "1.17.0", + "version": "1.18.0", "main": "dist/commonjs/index", "homepage": "https://github.com/microsoft/onnxruntime/blob/main/js/react_native/README.md", "files": [ diff --git a/js/react_native/yarn.lock b/js/react_native/yarn.lock index ff9be7fbe3a5..bbb0c4f3d1e2 100644 --- a/js/react_native/yarn.lock +++ b/js/react_native/yarn.lock @@ -3701,9 +3701,9 @@ invariant@^2.2.4: loose-envify "^1.0.0" ip@^1.1.5: - version "1.1.8" - resolved "https://registry.yarnpkg.com/ip/-/ip-1.1.8.tgz#ae05948f6b075435ed3307acce04629da8cdbf48" - integrity sha512-PuExPYUiu6qMBQb4l06ecm6T6ujzhmh+MeJcW9wa89PoAz5pvd4zPgN5WJV104mb6S2T1AwNIAaB70JNrLQWhg== + version "1.1.9" + resolved "https://registry.yarnpkg.com/ip/-/ip-1.1.9.tgz#8dfbcc99a754d07f425310b86a99546b1151e396" + integrity sha512-cyRxvOEpNHNtchU3Ln9KC/auJgup87llfQpQ+t5ghoC/UhL16SWzbueiCsdTnWmqAWl7LadfuwhlqmtOaqMHdQ== is-absolute@^1.0.0: version "1.0.0" @@ -5254,7 +5254,7 @@ onetime@^5.1.0, onetime@^5.1.2: mimic-fn "^2.1.0" "onnxruntime-common@file:../common": - version "1.17.0" + version "1.18.0" open@^6.2.0: version "6.4.0" diff --git a/js/web/README.md b/js/web/README.md index c75a40ad6da2..906c78a1b7ec 100644 --- a/js/web/README.md +++ b/js/web/README.md @@ -12,7 +12,7 @@ The [Open Neural Network Exchange](http://onnx.ai/) (ONNX) is an open standard f With ONNX Runtime Web, web developers can score models directly on browsers with various benefits including reducing server-client communication and protecting user privacy, as well 
as offering install-free and cross-platform in-browser ML experience. -ONNX Runtime Web can run on both CPU and GPU. On CPU side, [WebAssembly](https://developer.mozilla.org/en-US/docs/WebAssembly) is adopted to execute the model at near-native speed. ONNX Runtime Web complies the native ONNX Runtime CPU engine into WebAssembly backend by using Emscripten, so it supports most functionalities native ONNX Runtime offers, including full ONNX operator coverage, multi-threading, [ONNX Runtime Quantization](https://www.onnxruntime.ai/docs/how-to/quantization.html) as well as [ONNX Runtime Mobile](https://onnxruntime.ai/docs/tutorials/mobile/). For performance acceleration with GPUs, ONNX Runtime Web leverages WebGL, a popular standard for accessing GPU capabilities. We are keeping improving op coverage and optimizing performance in WebGL backend. +ONNX Runtime Web can run on both CPU and GPU. On the CPU side, [WebAssembly](https://developer.mozilla.org/en-US/docs/WebAssembly) is adopted to execute the model at near-native speed. ONNX Runtime Web compiles the native ONNX Runtime CPU engine into the WebAssembly backend using Emscripten, so it supports most functionalities that native ONNX Runtime offers, including full ONNX operator coverage, multi-threading, [ONNX Runtime Quantization](https://www.onnxruntime.ai/docs/how-to/quantization.html) as well as [ONNX Runtime Mobile](https://onnxruntime.ai/docs/tutorials/mobile/). For performance acceleration with GPUs, ONNX Runtime Web leverages WebGL, a popular standard for accessing GPU capabilities. We keep improving op coverage and optimizing performance in the WebGL backend. See [Compatibility](#Compatibility) and [Operators Supported](#Operators) for a list of platforms and operators ONNX Runtime Web currently supports.
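A minimal usage sketch of the two backends described above (the model URL, input name, and shapes are placeholders, not taken from this repository):

```js
import * as ort from 'onnxruntime-web';

const main = async () => {
  // 'wasm' runs the model on CPU via WebAssembly; pass 'webgl' instead to use the GPU backend.
  const session = await ort.InferenceSession.create('./model.onnx', {
    executionProviders: ['wasm'],
  });

  // A hypothetical single-input model taking a float32 tensor of shape [1, 4].
  const input = new ort.Tensor('float32', Float32Array.from([1, 2, 3, 4]), [1, 4]);
  const results = await session.run({input});
  console.log(results);
};

main();
```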
@@ -22,7 +22,7 @@ Refer to [ONNX Runtime JavaScript examples](https://github.com/microsoft/onnxrun ## Documents -### Developement +### Development Refer to the following links for development information: diff --git a/js/web/docs/webgl-operators.md b/js/web/docs/webgl-operators.md index 7c129b66bfa3..cd25819a2069 100644 --- a/js/web/docs/webgl-operators.md +++ b/js/web/docs/webgl-operators.md @@ -29,7 +29,7 @@ See [Compatibility](../README.md#Compatibility) for a list of the supported plat | [BitwiseOr](https://github.com/onnx/onnx/blob/main/docs/Operators.md#BitwiseOr) | | | [BitwiseXor](https://github.com/onnx/onnx/blob/main/docs/Operators.md#BitwiseXor) | | | [BlackmanWindow](https://github.com/onnx/onnx/blob/main/docs/Operators.md#BlackmanWindow) | | -| [Cast](https://github.com/onnx/onnx/blob/main/docs/Operators.md#Cast) | [6-8](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#Cast-6), [9-12](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#Cast-9), [13-18](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#Cast-13), [19+](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#Cast-19) | +| [Cast](https://github.com/onnx/onnx/blob/main/docs/Operators.md#Cast) | [6-8](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#Cast-6), [9-12](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#Cast-9), [13-18](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#Cast-13), [19-20](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#Cast-19), [21+](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#Cast-21) | | [CastLike](https://github.com/onnx/onnx/blob/main/docs/Operators.md#CastLike) | | | [Ceil](https://github.com/onnx/onnx/blob/main/docs/Operators.md#Ceil) | [6-12](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#Ceil-6), [13+](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#Ceil-13) | | [Celu](https://github.com/onnx/onnx/blob/main/docs/Operators.md#Celu) | | @@ -62,7 +62,7 @@ See [Compatibility](../README.md#Compatibility) for a list of the supported plat | [Exp](https://github.com/onnx/onnx/blob/main/docs/Operators.md#Exp) | [6-12](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#Exp-6), [13+](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#Exp-13) | | [Expand](https://github.com/onnx/onnx/blob/main/docs/Operators.md#Expand) | | | [EyeLike](https://github.com/onnx/onnx/blob/main/docs/Operators.md#EyeLike) | | -| [Flatten](https://github.com/onnx/onnx/blob/main/docs/Operators.md#Flatten) | [1-8](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#Flatten-1), [9-10](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#Flatten-9), [11-12](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#Flatten-11), [13+](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#Flatten-13) | +| [Flatten](https://github.com/onnx/onnx/blob/main/docs/Operators.md#Flatten) | [1-8](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#Flatten-1), [9-10](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#Flatten-9), [11-12](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#Flatten-11), [13-20](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#Flatten-13), [21+](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#Flatten-21) | | [Floor](https://github.com/onnx/onnx/blob/main/docs/Operators.md#Floor) | [6-12](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#Floor-6), [13+](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#Floor-13) | | 
[GRU](https://github.com/onnx/onnx/blob/main/docs/Operators.md#GRU) | | | [Gather](https://github.com/onnx/onnx/blob/main/docs/Operators.md#Gather) | [1-10](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#Gather-1), [11-12](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#Gather-11), [13+](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#Gather-13) | @@ -82,7 +82,7 @@ See [Compatibility](../README.md#Compatibility) for a list of the supported plat | [HardSigmoid](https://github.com/onnx/onnx/blob/main/docs/Operators.md#HardSigmoid) | | | [HardSwish](https://github.com/onnx/onnx/blob/main/docs/Operators.md#HardSwish) | | | [Hardmax](https://github.com/onnx/onnx/blob/main/docs/Operators.md#Hardmax) | | -| [Identity](https://github.com/onnx/onnx/blob/main/docs/Operators.md#Identity) | [1-12](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#Identity-1), [13](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#Identity-13), [14-15](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#Identity-14), [16-18](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#Identity-16), [19+](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#Identity-19) | +| [Identity](https://github.com/onnx/onnx/blob/main/docs/Operators.md#Identity) | [1-12](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#Identity-1), [13](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#Identity-13), [14-15](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#Identity-14), [16-18](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#Identity-16), [19-20](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#Identity-19), [21+](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#Identity-21) | | [If](https://github.com/onnx/onnx/blob/main/docs/Operators.md#If) | | | [ImageDecoder](https://github.com/onnx/onnx/blob/main/docs/Operators.md#ImageDecoder) | | | [InstanceNormalization](https://github.com/onnx/onnx/blob/main/docs/Operators.md#InstanceNormalization) | [6+](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#InstanceNormalization-6) | @@ -124,7 +124,7 @@ See [Compatibility](../README.md#Compatibility) for a list of the supported plat | [OptionalHasElement](https://github.com/onnx/onnx/blob/main/docs/Operators.md#OptionalHasElement) | | | [Or](https://github.com/onnx/onnx/blob/main/docs/Operators.md#Or) | [7+](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#Or-7) | | [PRelu](https://github.com/onnx/onnx/blob/main/docs/Operators.md#PRelu) | [7-8](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#PRelu-7), [9-15](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#PRelu-9), [16+](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#PRelu-16) | -| [Pad](https://github.com/onnx/onnx/blob/main/docs/Operators.md#Pad) | [2-10](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#Pad-2), [11-12](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#Pad-11), [13-17](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#Pad-13), [18](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#Pad-18), [19+](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#Pad-19) | +| [Pad](https://github.com/onnx/onnx/blob/main/docs/Operators.md#Pad) | [2-10](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#Pad-2), [11-12](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#Pad-11), [13-17](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#Pad-13), 
[18](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#Pad-18), [19-20](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#Pad-19), [21+](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#Pad-21) | | [Pow](https://github.com/onnx/onnx/blob/main/docs/Operators.md#Pow) | [7-11](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#Pow-7), [12](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#Pow-12), [13-14](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#Pow-13), [15+](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#Pow-15) | | [QLinearConv](https://github.com/onnx/onnx/blob/main/docs/Operators.md#QLinearConv) | | | [QLinearMatMul](https://github.com/onnx/onnx/blob/main/docs/Operators.md#QLinearMatMul) | | @@ -148,7 +148,7 @@ See [Compatibility](../README.md#Compatibility) for a list of the supported plat | [ReduceSumSquare](https://github.com/onnx/onnx/blob/main/docs/Operators.md#ReduceSumSquare) | [1-10](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#ReduceSumSquare-1), [11-12](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#ReduceSumSquare-11), [13-17](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#ReduceSumSquare-13), [18+](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#ReduceSumSquare-18) | | [RegexFullMatch](https://github.com/onnx/onnx/blob/main/docs/Operators.md#RegexFullMatch) | | | [Relu](https://github.com/onnx/onnx/blob/main/docs/Operators.md#Relu) | [6-12](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#Relu-6), [13](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#Relu-13), [14+](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#Relu-14) | -| [Reshape](https://github.com/onnx/onnx/blob/main/docs/Operators.md#Reshape) | [5-12](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#Reshape-5), [13](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#Reshape-13), [14-18](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#Reshape-14), [19+](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#Reshape-19) | +| [Reshape](https://github.com/onnx/onnx/blob/main/docs/Operators.md#Reshape) | [5-12](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#Reshape-5), [13](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#Reshape-13), [14-18](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#Reshape-14), [19-20](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#Reshape-19), [21+](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#Reshape-21) | | [Resize](https://github.com/onnx/onnx/blob/main/docs/Operators.md#Resize) | [10](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#Resize-10), [11-12](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#Resize-11), [13-17](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#Resize-13), [18](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#Resize-18), [19+](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#Resize-19) | | [ReverseSequence](https://github.com/onnx/onnx/blob/main/docs/Operators.md#ReverseSequence) | | | [RoiAlign](https://github.com/onnx/onnx/blob/main/docs/Operators.md#RoiAlign) | | @@ -166,7 +166,7 @@ See [Compatibility](../README.md#Compatibility) for a list of the supported plat | [SequenceInsert](https://github.com/onnx/onnx/blob/main/docs/Operators.md#SequenceInsert) | | | [SequenceLength](https://github.com/onnx/onnx/blob/main/docs/Operators.md#SequenceLength) | | | 
[SequenceMap](https://github.com/onnx/onnx/blob/main/docs/Operators.md#SequenceMap) | | -| [Shape](https://github.com/onnx/onnx/blob/main/docs/Operators.md#Shape) | [1-12](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#Shape-1), [13-14](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#Shape-13), [15-18](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#Shape-15), [19+](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#Shape-19) | +| [Shape](https://github.com/onnx/onnx/blob/main/docs/Operators.md#Shape) | [1-12](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#Shape-1), [13-14](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#Shape-13), [15-18](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#Shape-15), [19-20](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#Shape-19), [21+](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#Shape-21) | | [Shrink](https://github.com/onnx/onnx/blob/main/docs/Operators.md#Shrink) | | | [Sigmoid](https://github.com/onnx/onnx/blob/main/docs/Operators.md#Sigmoid) | [6-12](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#Sigmoid-6), [13+](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#Sigmoid-13) | | [Sign](https://github.com/onnx/onnx/blob/main/docs/Operators.md#Sign) | | @@ -182,7 +182,7 @@ See [Compatibility](../README.md#Compatibility) for a list of the supported plat | [Split](https://github.com/onnx/onnx/blob/main/docs/Operators.md#Split) | [2-10](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#Split-2), [11-12](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#Split-11) | | [SplitToSequence](https://github.com/onnx/onnx/blob/main/docs/Operators.md#SplitToSequence) | | | [Sqrt](https://github.com/onnx/onnx/blob/main/docs/Operators.md#Sqrt) | [6-12](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#Sqrt-6), [13+](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#Sqrt-13) | -| [Squeeze](https://github.com/onnx/onnx/blob/main/docs/Operators.md#Squeeze) | [1-10](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#Squeeze-1), [11-12](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#Squeeze-11), [13+](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#Squeeze-13) | +| [Squeeze](https://github.com/onnx/onnx/blob/main/docs/Operators.md#Squeeze) | [1-10](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#Squeeze-1), [11-12](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#Squeeze-11), [13-20](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#Squeeze-13), [21+](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#Squeeze-21) | | [StringConcat](https://github.com/onnx/onnx/blob/main/docs/Operators.md#StringConcat) | | | [StringNormalizer](https://github.com/onnx/onnx/blob/main/docs/Operators.md#StringNormalizer) | | | [StringSplit](https://github.com/onnx/onnx/blob/main/docs/Operators.md#StringSplit) | | @@ -194,10 +194,10 @@ See [Compatibility](../README.md#Compatibility) for a list of the supported plat | [ThresholdedRelu](https://github.com/onnx/onnx/blob/main/docs/Operators.md#ThresholdedRelu) | | | [Tile](https://github.com/onnx/onnx/blob/main/docs/Operators.md#Tile) | [6-12](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#Tile-6), [13+](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#Tile-13) | | [TopK](https://github.com/onnx/onnx/blob/main/docs/Operators.md#TopK) | | -| [Transpose](https://github.com/onnx/onnx/blob/main/docs/Operators.md#Transpose) | 
[1-12](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#Transpose-1), [13+](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#Transpose-13) | +| [Transpose](https://github.com/onnx/onnx/blob/main/docs/Operators.md#Transpose) | [1-12](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#Transpose-1), [13-20](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#Transpose-13), [21+](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#Transpose-21) | | [Trilu](https://github.com/onnx/onnx/blob/main/docs/Operators.md#Trilu) | | | [Unique](https://github.com/onnx/onnx/blob/main/docs/Operators.md#Unique) | | -| [Unsqueeze](https://github.com/onnx/onnx/blob/main/docs/Operators.md#Unsqueeze) | [1-10](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#Unsqueeze-1), [11-12](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#Unsqueeze-11), [13+](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#Unsqueeze-13) | +| [Unsqueeze](https://github.com/onnx/onnx/blob/main/docs/Operators.md#Unsqueeze) | [1-10](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#Unsqueeze-1), [11-12](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#Unsqueeze-11), [13-20](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#Unsqueeze-13), [21+](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#Unsqueeze-21) | | [Upsample](https://github.com/onnx/onnx/blob/main/docs/Operators.md#Upsample) | [7-8](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#Upsample-7), [9](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#Upsample-9) | | [Where](https://github.com/onnx/onnx/blob/main/docs/Operators.md#Where) | | | [Xor](https://github.com/onnx/onnx/blob/main/docs/Operators.md#Xor) | [7+](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#Xor-7) | diff --git a/js/web/docs/webgpu-operators.md b/js/web/docs/webgpu-operators.md index 2f510308d930..c93f4f3cce68 100644 --- a/js/web/docs/webgpu-operators.md +++ b/js/web/docs/webgpu-operators.md @@ -34,6 +34,7 @@ Do not modify directly.* | Cos | ai.onnx(7+) | | | Cosh | ai.onnx(9+) | | | CumSum | ai.onnx(11-13,14+) | | +| DepthToSpace | ai.onnx(11-12,13+); com.ms.internal.nhwc(11-12,13+) | | | Div | ai.onnx(7-12,13,14+) | | | Einsum | ai.onnx(12+) | | | Elu | ai.onnx(6+) | | @@ -41,6 +42,7 @@ Do not modify directly.* | Erf | ai.onnx(9-12,13+) | | | Exp | ai.onnx(6-12,13+) | | | Expand | ai.onnx(8-12,13+) | | +| FastGelu | com.microsoft(1+) | | | Flatten | ai.onnx(1-8,9-10,11-12,13+) | | | Floor | ai.onnx(6-12,13+) | | | FusedConv | com.microsoft(1+) | | @@ -52,6 +54,7 @@ Do not modify directly.* | GlobalMaxPool | ai.onnx(1+); com.ms.internal.nhwc(1+) | | | Greater | ai.onnx(7-8,9-12,13+) | | | GreaterOrEqual | ai.onnx(12-15,16+) | | +| HardSigmoid | ai.onnx(6+) | | | If | ai.onnx(1-10,11-12,13-18,19+) | | | InstanceNormalization | ai.onnx(6+); com.ms.internal.nhwc(6+) | | | LayerNormalization | ai.onnx(17+) | | @@ -60,6 +63,7 @@ Do not modify directly.* | LessOrEqual | ai.onnx(12-15,16+) | | | Log | ai.onnx(6-12,13+) | | | MatMul | ai.onnx(1-12,13+) | | +| MatMulNBits | com.microsoft(1+) | | | MaxPool | ai.onnx(1-7,8-9,10,11,12+); com.ms.internal.nhwc(1-7,8-9,10,11,12+) | need perf optimization; need implementing activation | | MemcpyFromHost | ai.onnx(1+) | | | MemcpyToHost | ai.onnx(1+) | | @@ -84,11 +88,14 @@ Do not modify directly.* | Relu | ai.onnx(6-12,13,14+) | | | Reshape | ai.onnx(5-12,13,14+) | no GPU kernel | | Resize | ai.onnx(10,11-12,13-17,18,19+); com.ms.internal.nhwc(10,11-12,13-17,18,19+) | 
CoordinateTransformMode align_corners is not supported with downsampling | +| RotaryEmbedding | com.microsoft(1+) | | | Shape | ai.onnx(1-12,13-14,15+) | no GPU kernel; an ORT warning is generated - need to fix | | Sigmoid | ai.onnx(6-12,13+) | | +| SimplifiedLayerNormalization | ai.onnx(1+) | | | Sin | ai.onnx(7+) | | | Sinh | ai.onnx(9+) | | | SkipLayerNormalization | com.microsoft(1+) | | +| SkipSimplifiedLayerNormalization | com.microsoft(1+) | | | Slice | ai.onnx(1-9,10,11-12,13+) | | | Softmax | ai.onnx(1-10,11-12,13+) | | | Split | ai.onnx(1,2-10,11-12,13-17,18+) | | diff --git a/js/web/karma.conf.js b/js/web/karma.conf.js index 8fce79843f61..507da0de2b4a 100644 --- a/js/web/karma.conf.js +++ b/js/web/karma.conf.js @@ -9,6 +9,8 @@ const karmaPlugins = args['karma-plugins'] || undefined; const timeoutMocha = args['timeout-mocha'] || 60000; const forceLocalHost = !!args['force-localhost']; +// user data directory; will be passed to the Edge/Chrome/ChromeCanary/Firefox launchers +const userDataDir = args['user-data-dir']; // parse chromium flags let chromiumFlags = args['chromium-flags']; if (!chromiumFlags) { @@ -86,11 +88,12 @@ module.exports = function(config) { hostname, listenAddress, customLaunchers: { - // the following flags are used to make sure Edge on CI agents to initialize WebGPU correctly. - EdgeTest: {base: 'Edge', flags: chromiumFlags}, - ChromeTest: {base: 'Chrome', flags: chromiumFlags}, - ChromeTestHeadless: {base: 'ChromeHeadless', flags: chromiumFlags}, - ChromeCanaryTest: {base: 'ChromeCanary', flags: chromiumFlags}, + // Chromium-based browsers + EdgeTest: {base: 'Edge', flags: chromiumFlags, edgeDataDir: userDataDir}, + ChromeTest: {base: 'Chrome', flags: chromiumFlags, chromeDataDir: userDataDir}, + ChromeCanaryTest: {base: 'ChromeCanary', flags: chromiumFlags, chromeDataDir: userDataDir}, + FirefoxTest: {base: 'Firefox', profile: userDataDir}, + // // ==== BrowserStack browsers ==== // diff --git a/js/web/lib/backend-wasm.ts b/js/web/lib/backend-wasm.ts index 2d123cdb7129..31ecffb07e40 100644 --- a/js/web/lib/backend-wasm.ts +++ b/js/web/lib/backend-wasm.ts @@ -26,7 +26,17 @@ export const initializeFlags = (): void => { env.wasm.proxy = false; } + if (typeof env.wasm.trace !== 'boolean') { + env.wasm.trace = false; + } + if (typeof env.wasm.numThreads !== 'number' || !Number.isInteger(env.wasm.numThreads) || env.wasm.numThreads <= 0) { + // Web: when crossOriginIsolated is false, SharedArrayBuffer is not available so WebAssembly threads will not work. + // Node.js: onnxruntime-web does not support multi-threads in Node.js. + if ((typeof self !== 'undefined' && !self.crossOriginIsolated) || + (typeof process !== 'undefined' && process.versions && process.versions.node)) { + env.wasm.numThreads = 1; + } else { + const numCpuLogicalCores = typeof navigator === 'undefined' ? cpus().length : navigator.hardwareConcurrency; + env.wasm.numThreads = Math.min(4, Math.ceil((numCpuLogicalCores || 1) / 2)); + } } diff --git a/js/web/lib/build-def.d.ts b/js/web/lib/build-def.d.ts index fb714bf5996f..2c9cd88a375b 100644 --- a/js/web/lib/build-def.d.ts +++ b/js/web/lib/build-def.d.ts @@ -19,7 +19,7 @@ interface BuildDefinitions { */ readonly DISABLE_WEBGPU: boolean; /** - * defines whether to disable the whole WebAssembly backend in the build. + * defines whether to disable the whole WebAssembly backend in the build. This also disables the WebNN backend, + * which runs on top of the WebAssembly backend.
*/ readonly DISABLE_WASM: boolean; /** diff --git a/js/web/lib/index.ts b/js/web/lib/index.ts index 499327741c82..b212c0f49df3 100644 --- a/js/web/lib/index.ts +++ b/js/web/lib/index.ts @@ -23,13 +23,10 @@ if (!BUILD_DEFS.DISABLE_WASM) { require('./backend-wasm-training').wasmBackend; if (!BUILD_DEFS.DISABLE_WEBGPU) { registerBackend('webgpu', wasmBackend, 5); + registerBackend('webnn', wasmBackend, 5); } registerBackend('cpu', wasmBackend, 10); registerBackend('wasm', wasmBackend, 10); - if (BUILD_DEFS.DISABLE_TRAINING) { - registerBackend('xnnpack', wasmBackend, 9); - registerBackend('webnn', wasmBackend, 9); - } } Object.defineProperty(env.versions, 'web', {value: version, enumerable: true}); diff --git a/js/web/lib/onnxjs/model.ts b/js/web/lib/onnxjs/model.ts index f9a1b6e76089..8e689626011b 100644 --- a/js/web/lib/onnxjs/model.ts +++ b/js/web/lib/onnxjs/model.ts @@ -16,6 +16,7 @@ export class Model { constructor() {} load(buf: Uint8Array, graphInitializer?: Graph.Initializer, isOrtFormat?: boolean): void { + let onnxError: Error|undefined; if (!isOrtFormat) { // isOrtFormat === false || isOrtFormat === undefined try { @@ -25,10 +26,19 @@ export class Model { if (isOrtFormat !== undefined) { throw e; } + onnxError = e; } } - this.loadFromOrtFormat(buf, graphInitializer); + try { + this.loadFromOrtFormat(buf, graphInitializer); + } catch (e) { + if (isOrtFormat !== undefined) { + throw e; + } + // Tried both formats and failed (when isOrtFormat === undefined) + throw new Error(`Failed to load model as ONNX format: ${onnxError}\nas ORT format: ${e}`); + } } private loadFromOnnxFormat(buf: Uint8Array, graphInitializer?: Graph.Initializer): void { diff --git a/js/web/lib/version.ts b/js/web/lib/version.ts index 96c2361cceab..40f970ddf02a 100644 --- a/js/web/lib/version.ts +++ b/js/web/lib/version.ts @@ -4,4 +4,4 @@ // This file is generated by /js/scripts/update-version.ts // Do not modify file content manually. -export const version = '1.17.0'; +export const version = '1.18.0'; diff --git a/js/web/lib/wasm/binding/ort-wasm.d.ts b/js/web/lib/wasm/binding/ort-wasm.d.ts index 00431a4e86d5..56925b728e9a 100644 --- a/js/web/lib/wasm/binding/ort-wasm.d.ts +++ b/js/web/lib/wasm/binding/ort-wasm.d.ts @@ -13,25 +13,105 @@ export declare namespace JSEP { type ReleaseKernelFunction = (kernel: number) => void; type RunFunction = (kernel: number, contextDataOffset: number, sessionHandle: number, errors: Array<Promise<string|null>>) => number; + type CaptureBeginFunction = () => void; + type CaptureEndFunction = () => void; + type ReplayFunction = () => void; + + export interface Module extends WebGpuModule { + /** + * Mount the external data file to an internal map, which will be used during session initialization. + * + * @param externalDataFilePath - specify the relative path of the external data file. + * @param externalDataFileData - specify the content data. + */ + mountExternalData(externalDataFilePath: string, externalDataFileData: Uint8Array): void; + /** + * Unmount all external data files from the internal map. + */ + unmountExternalData(): void; + + /** + * This is the entry of JSEP initialization. This function is called once when initializing ONNX Runtime per + * backend. This function initializes Asyncify support. If name is 'webgpu', it also initializes the WebGPU backend + * and registers a few callbacks that will be called in C++ code.
+ */ + jsepInit(name: 'webgpu', initParams: [ + backend: BackendType, alloc: AllocFunction, free: FreeFunction, upload: UploadFunction, + download: DownloadFunction, createKernel: CreateKernelFunction, releaseKernel: ReleaseKernelFunction, + run: RunFunction, captureBegin: CaptureBeginFunction, captureEnd: CaptureEndFunction, replay: ReplayFunction + ]): void; + jsepInit(name: 'webnn', initParams?: never): void; + } + + export interface WebGpuModule { + /** + * [exported from wasm] Specify a kernel's output when running OpKernel::Compute(). + * + * @param context - specify the kernel context pointer. + * @param index - specify the index of the output. + * @param data - specify the pointer to encoded data of type and dims. + */ + _JsepOutput(context: number, index: number, data: number): number; + /** + * [exported from wasm] Get name of an operator node. + * + * @param kernel - specify the kernel pointer. + * @returns the pointer to a C-style UTF8 encoded string representing the node name. + */ + _JsepGetNodeName(kernel: number): number; + + /** + * [exported from js_internal_api.js] Register a user GPU buffer for use as a session's input or output. + * + * @param sessionId - specify the session ID. + * @param index - specify an integer to represent which input/output it is registering for. For input, it is the + * input_index corresponding to the session's inputNames. For output, it is the inputCount + output_index + * corresponding to the session's outputNames. + * @param buffer - specify the GPU buffer to register. + * @param size - specify the original data size in bytes. + * @returns the GPU data ID for the registered GPU buffer. + */ + jsepRegisterBuffer: (sessionId: number, index: number, buffer: GPUBuffer, size: number) => number; + /** + * [exported from js_internal_api.js] Get the GPU buffer by GPU data ID. + * + * @param dataId - specify the GPU data ID + * @returns the GPU buffer. + */ + jsepGetBuffer: (dataId: number) => GPUBuffer; + /** + * [exported from js_internal_api.js] Create a function to be used to create a GPU Tensor. + * + * @param gpuBuffer - specify the GPU buffer + * @param size - specify the original data size in bytes. + * @param type - specify the tensor type. + * @returns the generated downloader function. + */ + jsepCreateDownloader: + (gpuBuffer: GPUBuffer, size: number, + type: Tensor.GpuBufferDataTypes) => () => Promise<Tensor.DataType>; + /** + * [exported from js_internal_api.js] Called when InferenceSession.run starts. This function will be called before + * _OrtRun[WithBinding]() is called. + * @param sessionId - specify the session ID. + */ + jsepOnRunStart: (sessionId: number) => void; + /** + * [exported from js_internal_api.js] Release a session. This function will be called before _OrtReleaseSession() is + * called. + * @param sessionId - specify the session ID.
+ * @returns + */ + jsepOnReleaseSession: (sessionId: number) => void; + } } -export interface OrtWasmModule extends EmscriptenModule { - // #region emscripten functions - stackSave(): number; - stackRestore(stack: number): void; - stackAlloc(size: number): number; - - UTF8ToString(offset: number, maxBytesToRead?: number): string; - lengthBytesUTF8(str: string): number; - stringToUTF8(str: string, offset: number, maxBytes: number): void; - // #endregion - - // #region ORT APIs +export interface OrtInferenceAPIs { _OrtInit(numThreads: number, loggingLevel: number): number; _OrtGetLastError(errorCodeOffset: number, errorMessageOffset: number): void; - _OrtCreateSession(dataOffset: number, dataLength: number, sessionOptionsHandle: number): number; + _OrtCreateSession(dataOffset: number, dataLength: number, sessionOptionsHandle: number): Promise; _OrtReleaseSession(sessionHandle: number): void; _OrtGetInputOutputCount(sessionHandle: number, inputCountOffset: number, outputCountOffset: number): number; _OrtGetInputName(sessionHandle: number, index: number): number; @@ -71,112 +151,61 @@ export interface OrtWasmModule extends EmscriptenModule { _OrtReleaseRunOptions(runOptionsHandle: number): void; _OrtEndProfiling(sessionHandle: number): number; - // #endregion +} - // #region ORT Training APIs - _OrtTrainingLoadCheckpoint?(dataOffset: number, dataLength: number): number; +export interface OrtTrainingAPIs { + _OrtTrainingLoadCheckpoint(dataOffset: number, dataLength: number): number; - _OrtTrainingReleaseCheckpoint?(checkpointHandle: number): void; + _OrtTrainingReleaseCheckpoint(checkpointHandle: number): void; - _OrtTrainingCreateSession? - (sessionOptionsHandle: number, checkpointHandle: number, trainOffset: number, trainLength: number, - evalOffset: number, evalLength: number, optimizerOffset: number, optimizerLength: number): number; + _OrtTrainingCreateSession( + sessionOptionsHandle: number, checkpointHandle: number, trainOffset: number, trainLength: number, + evalOffset: number, evalLength: number, optimizerOffset: number, optimizerLength: number): number; - _OrtTrainingLazyResetGrad?(trainingHandle: number): number; + _OrtTrainingLazyResetGrad(trainingHandle: number): number; - _OrtTrainingRunTrainStep? - (trainingHandle: number, inputsOffset: number, inputCount: number, outputsOffset: number, outputCount: number, - runOptionsHandle: number): number; + _OrtTrainingRunTrainStep( + trainingHandle: number, inputsOffset: number, inputCount: number, outputsOffset: number, outputCount: number, + runOptionsHandle: number): number; - _OrtTrainingOptimizerStep?(trainingHandle: number, runOptionsHandle: number): number; + _OrtTrainingOptimizerStep(trainingHandle: number, runOptionsHandle: number): number; - _OrtTrainingEvalStep? - (trainingHandle: number, inputsOffset: number, inputCount: number, outputsOffset: number, outputCount: number, - runOptionsHandle: number): number; + _OrtTrainingEvalStep( + trainingHandle: number, inputsOffset: number, inputCount: number, outputsOffset: number, outputCount: number, + runOptionsHandle: number): number; - _OrtTrainingGetParametersSize?(trainingHandle: number, paramSizeT: number, trainableOnly: boolean): number; + _OrtTrainingGetParametersSize(trainingHandle: number, paramSizeT: number, trainableOnly: boolean): number; - _OrtTrainingCopyParametersToBuffer? 
- (trainingHandle: number, parametersBuffer: number, parameterCount: number, trainableOnly: boolean): number; + _OrtTrainingCopyParametersToBuffer( + trainingHandle: number, parametersBuffer: number, parameterCount: number, trainableOnly: boolean): number; - _OrtTrainingCopyParametersFromBuffer? - (trainingHandle: number, parametersBuffer: number, parameterCount: number, trainableOnly: boolean): number; + _OrtTrainingCopyParametersFromBuffer( + trainingHandle: number, parametersBuffer: number, parameterCount: number, trainableOnly: boolean): number; - _OrtTrainingGetModelInputOutputCount? - (trainingHandle: number, inputCount: number, outputCount: number, isEvalModel: boolean): number; - _OrtTrainingGetModelInputOutputName? - (trainingHandle: number, index: number, isInput: boolean, isEvalModel: boolean): number; + _OrtTrainingGetModelInputOutputCount( + trainingHandle: number, inputCount: number, outputCount: number, isEvalModel: boolean): number; + _OrtTrainingGetModelInputOutputName(trainingHandle: number, index: number, isInput: boolean, isEvalModel: boolean): + number; - _OrtTrainingReleaseSession?(trainingHandle: number): void; + _OrtTrainingReleaseSession(trainingHandle: number): void; +} + +export interface OrtWasmModule extends EmscriptenModule, OrtInferenceAPIs, Partial, + Partial { + // #region emscripten functions + stackSave(): number; + stackRestore(stack: number): void; + stackAlloc(size: number): number; + + UTF8ToString(offset: number, maxBytesToRead?: number): string; + lengthBytesUTF8(str: string): number; + stringToUTF8(str: string, offset: number, maxBytes: number): void; // #endregion // #region config + numThreads?: number; mainScriptUrlOrBlob?: string|Blob; // #endregion - - // #region JSEP - /** - * This is the entry of JSEP initialization. This function is called once when initializing ONNX Runtime. - * This function initializes WebGPU backend and registers a few callbacks that will be called in C++ code. - */ - jsepInit? - (backend: JSEP.BackendType, alloc: JSEP.AllocFunction, free: JSEP.FreeFunction, upload: JSEP.UploadFunction, - download: JSEP.DownloadFunction, createKernel: JSEP.CreateKernelFunction, - releaseKernel: JSEP.ReleaseKernelFunction, run: JSEP.RunFunction): void; - - /** - * [exported from wasm] Specify a kernel's output when running OpKernel::Compute(). - * - * @param context - specify the kernel context pointer. - * @param index - specify the index of the output. - * @param data - specify the pointer to encoded data of type and dims. - */ - _JsepOutput(context: number, index: number, data: number): number; - /** - * [exported from wasm] Get name of an operator node. - * - * @param kernel - specify the kernel pointer. - * @returns the pointer to a C-style UTF8 encoded string representing the node name. - */ - _JsepGetNodeName(kernel: number): number; - - /** - * [exported from js_internal_api.js] Register a user GPU buffer for usage of a session's input or output. - * - * @param sessionId - specify the session ID. - * @param index - specify an integer to represent which input/output it is registering for. For input, it is the - * input_index corresponding to the session's inputNames. For output, it is the inputCount + output_index - * corresponding to the session's ouputNames. - * @param buffer - specify the GPU buffer to register. - * @param size - specify the original data size in byte. - * @returns the GPU data ID for the registered GPU buffer. 
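An aside on the index convention in the jsepRegisterBuffer documentation (it appears both in the new WebGpuModule interface above and in the old declarations removed below): inputs are addressed by their input_index, outputs by inputCount + output_index. The helper below is purely illustrative and not part of the API:

// Computes the flat index expected by jsepRegisterBuffer, assuming the
// session's inputNames/outputNames arrays are available to the caller.
function toBufferRegistrationIndex(
    inputNames: readonly string[], outputNames: readonly string[], name: string): number {
  const inputIndex = inputNames.indexOf(name);
  if (inputIndex !== -1) {
    return inputIndex;  // inputs use input_index directly
  }
  const outputIndex = outputNames.indexOf(name);
  if (outputIndex === -1) {
    throw new Error(`'${name}' is neither an input nor an output of the session`);
  }
  return inputNames.length + outputIndex;  // outputs are offset by inputCount
}

// e.g. with inputs ['data'] and outputs ['label'], registering a buffer for
// the output 'label' uses index 1 (inputCount 1 + output_index 0).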
- */ - jsepRegisterBuffer: (sessionId: number, index: number, buffer: GPUBuffer, size: number) => number; - /** - * [exported from js_internal_api.js] Unregister all user GPU buffers for a session. - * - * @param sessionId - specify the session ID. - */ - jsepUnregisterBuffers?: (sessionId: number) => void; - /** - * [exported from js_internal_api.js] Get the GPU buffer by GPU data ID. - * - * @param dataId - specify the GPU data ID - * @returns the GPU buffer. - */ - jsepGetBuffer: (dataId: number) => GPUBuffer; - /** - * [exported from js_internal_api.js] Create a function to be used to create a GPU Tensor. - * - * @param gpuBuffer - specify the GPU buffer - * @param size - specify the original data size in byte. - * @param type - specify the tensor type. - * @returns the generated downloader function. - */ - jsepCreateDownloader: - (gpuBuffer: GPUBuffer, size: number, - type: Tensor.GpuBufferDataTypes) => () => Promise; - // #endregion } declare const moduleFactory: EmscriptenModuleFactory; diff --git a/js/web/lib/wasm/jsep/backend-webgpu.ts b/js/web/lib/wasm/jsep/backend-webgpu.ts index 6c3d22352772..1b421029cc7a 100644 --- a/js/web/lib/wasm/jsep/backend-webgpu.ts +++ b/js/web/lib/wasm/jsep/backend-webgpu.ts @@ -1,14 +1,37 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -import {Env, Tensor} from 'onnxruntime-common'; +import {Env, Tensor, TRACE, TRACE_FUNC_BEGIN, TRACE_FUNC_END} from 'onnxruntime-common'; + +import {DataType, tensorDataTypeEnumToString} from '../wasm-common'; import {configureLogger, LOG_DEBUG} from './log'; import {createView, TensorView} from './tensor-view'; import {createGpuDataManager, downloadGpuData, GpuDataManager} from './webgpu/gpu-data-manager'; import {RunFunction, WEBGPU_OP_RESOLVE_RULES} from './webgpu/op-resolve-rules'; import {ProgramManager} from './webgpu/program-manager'; -import {ComputeContext, GpuData, ProgramInfo, ProgramInputTensorInfoDependency} from './webgpu/types'; +import {AdapterInfo, ComputeContext, GpuArchitecture, GpuData, GpuVendor, ProgramInfo, ProgramInputTensorInfoDependency, SessionState, TimestampQuery} from './webgpu/types'; + +interface CommandInfo { + readonly kernelId: number; + readonly computePipeline: GPUComputePipeline; + readonly bindGroup: GPUBindGroup; + readonly dispatchGroup: [number, number, number]; +} + +interface KernelInfo { + readonly kernelType: string; + readonly kernelName: string; + readonly kernelEntry: RunFunction; + readonly attributes: [((attribute: unknown) => unknown)|undefined, unknown]; +} + +interface PendingKernelInfo { + readonly kernelId: number; + readonly programName: string; + readonly inputTensorViews: readonly TensorView[]; + readonly outputTensorViews: readonly TensorView[]; +} const getProgramInputTensorInfoDependencyKey = (inputTensors: readonly TensorView[], inputDependencies: readonly ProgramInputTensorInfoDependency[]): string => { @@ -71,11 +94,32 @@ const getProgramInfoUniqueKey = return key; }; +class AdapterInfoImpl implements AdapterInfo { + readonly architecture?: string; + readonly vendor?: string; + + constructor(adapterInfo: GPUAdapterInfo) { + if (adapterInfo) { + this.architecture = adapterInfo.architecture; + this.vendor = adapterInfo.vendor; + } + } + + isArchitecture(architecture: GpuArchitecture): boolean { + return this.architecture === architecture; + } + + isVendor(vendor: GpuVendor): boolean { + return this.vendor === vendor; + } +} + /** * this class is designed to store status and being used as a singleton for 
JSEP. It will be passed to jsepInit() as
 * the first parameter so that it is stored for future use.
 */
 export class WebGpuBackend {
+  adapterInfo: AdapterInfoImpl;
   device: GPUDevice;
   /**
    * an instance of GpuDataManager to manage a GpuDataId -> GpuBuffer mapping
@@ -87,6 +131,13 @@ export class WebGpuBackend {
    */
   programManager: ProgramManager;

+  /**
+   * representing the ID of the session that is currently being run.
+   * `null` means no session is being run.
+   * only valid when session.run is executed.
+   */
+  currentSessionId: number|null = null;
+
   /**
    * representing the kernel ID of which is currently being computed (CPU code perspective).
    * `null` means no kernel is being computed.
@@ -122,22 +173,33 @@ export class WebGpuBackend {
     return data;
   }

-  /**
-   * a KernelID -> kernel info mapping. value is
-   * [ op_type, name, run function, [optional] preprocess_attribute_once function ]
-   */
-  kernels: Map<number, [string, string, RunFunction, [((attribute: unknown) => unknown) | undefined, unknown]]>;
-
+  // KernelID -> kernelInfo mapping
+  kernels: Map<number, KernelInfo>;
   private commandEncoder: GPUCommandEncoder|null = null;
   private computePassEncoder: GPUComputePassEncoder|null = null;
+  maxDispatchNumber = 16;
   pendingDispatchNumber = 0;

-  queryData?: GpuData;
-  querySet?: GPUQuerySet;
-  querySetCount = 2;
-  queryTimeBase?: bigint;
+  // info of kernels pending submission for a single batch
+  private pendingKernels: PendingKernelInfo[] = [];
+  // queryReadBuffer -> pendingKernels mapping for all the batches
+  private pendingQueries: Map<GPUBuffer, PendingKernelInfo[]> = new Map();
+  private queryResolveBuffer?: GPUBuffer;
+  private querySet?: GPUQuerySet;
+  private queryTimeBase?: bigint;
+  queryType: TimestampQuery;

   env: Env;
+  sessionStatus: SessionState = 'default';
+  /**
+   * a SessionID -> CommandInfo[] mapping. It's used to record all GPU commands for the corresponding session.
+   */
+  capturedCommandList: Map<number, CommandInfo[]> = new Map();
+
+  /**
+   * a SessionID -> PendingKernelInfo[] mapping for profiling.
+   */
+  private capturedPendingKernels: Map<number, PendingKernelInfo[]> = new Map();

   /**
    * a SessionID -> a Map of (InputOutputIndex -> [ID, GPUBuffer]) mapping.
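The capture-related fields introduced above (sessionStatus, capturedCommandList, capturedPendingKernels) are all keyed by session ID, so several sessions can hold captured graphs on one backend. A reduced sketch of that bookkeeping with simplified stand-in types; CaptureState and RecordedCommand are illustrative names:

type SessionState = 'default'|'capturing'|'replaying';

interface RecordedCommand {
  computePipeline: GPUComputePipeline;
  bindGroup: GPUBindGroup;
  dispatchGroup: [number, number, number];
}

class CaptureState {
  sessionStatus: SessionState = 'default';
  readonly capturedCommandList = new Map<number, RecordedCommand[]>();

  record(sessionId: number, command: RecordedCommand): void {
    if (this.sessionStatus !== 'capturing') {
      return;  // only record while a capture is active
    }
    let commands = this.capturedCommandList.get(sessionId);
    if (!commands) {
      commands = [];
      this.capturedCommandList.set(sessionId, commands);
    }
    commands.push(command);  // replayed later in submission order
  }
}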
@@ -161,7 +223,9 @@ export class WebGpuBackend { requiredFeatures, }; - if (adapter.features.has('timestamp-query')) { + if (adapter.features.has('chromium-experimental-timestamp-query-inside-passes')) { + requiredFeatures.push('chromium-experimental-timestamp-query-inside-passes' as GPUFeatureName); + } else if (adapter.features.has('timestamp-query')) { requiredFeatures.push('timestamp-query'); } if (adapter.features.has('shader-f16')) { @@ -169,6 +233,7 @@ export class WebGpuBackend { } this.device = await adapter.requestDevice(deviceDescriptor); + this.adapterInfo = new AdapterInfoImpl(await adapter.requestAdapterInfo()); this.gpuDataManager = createGpuDataManager(this); this.programManager = new ProgramManager(this); this.kernels = new Map(); @@ -187,7 +252,13 @@ export class WebGpuBackend { } }; - Object.defineProperty(this.env.webgpu, 'device', {value: this.device}); + Object.defineProperty( + this.env.webgpu, 'device', {value: this.device, writable: false, enumerable: true, configurable: false}); + Object.defineProperty( + this.env.webgpu, 'adapter', {value: adapter, writable: false, enumerable: true, configurable: false}); + + // init queryType, which is necessary for InferenceSession.create + this.setQueryType(); } dispose(): void { @@ -206,22 +277,18 @@ export class WebGpuBackend { getComputePassEncoder(): GPUComputePassEncoder { if (!this.computePassEncoder) { + const commandEncoder = this.getCommandEncoder(); const computePassDescriptor: GPUComputePassDescriptor = {}; - if (this.isQueryEnabled()) { - if (typeof this.querySet === 'undefined') { - this.querySet = this.device.createQuerySet({ - type: 'timestamp', - count: this.querySetCount, - }); - } + + if (this.queryType === 'at-passes') { computePassDescriptor.timestampWrites = { - querySet: this.querySet, - beginningOfPassWriteIndex: 0, - endOfPassWriteIndex: 1, + querySet: this.querySet!, + beginningOfPassWriteIndex: this.pendingDispatchNumber * 2, + endOfPassWriteIndex: this.pendingDispatchNumber * 2 + 1, }; } - this.computePassEncoder = this.getCommandEncoder().beginComputePass(computePassDescriptor); + this.computePassEncoder = commandEncoder.beginComputePass(computePassDescriptor); } return this.computePassEncoder; } @@ -234,19 +301,95 @@ export class WebGpuBackend { } flush(): void { - if (this.commandEncoder) { - this.endComputePass(); - this.device.queue.submit([this.getCommandEncoder().finish()]); - this.gpuDataManager.refreshPendingBuffers(); - this.commandEncoder = null; - this.pendingDispatchNumber = 0; + if (!this.commandEncoder) { + return; } - } - isQueryEnabled(): boolean { - return this.device.features.has('timestamp-query') && - (this.env.webgpu.profiling?.mode === 'default' || - (!this.env.webgpu.profiling?.mode && this.env.webgpu.profilingMode === 'default')); + TRACE_FUNC_BEGIN(); + + this.endComputePass(); + let queryReadBuffer: GPUBuffer; + if (this.queryType !== 'none') { + this.commandEncoder.resolveQuerySet( + this.querySet!, 0, this.pendingDispatchNumber * 2, this.queryResolveBuffer!, 0); + + queryReadBuffer = this.device.createBuffer( + // eslint-disable-next-line no-bitwise + {size: this.pendingDispatchNumber * 2 * 8, usage: GPUBufferUsage.MAP_READ | GPUBufferUsage.COPY_DST}); + + this.pendingQueries.set(queryReadBuffer, this.pendingKernels); + this.pendingKernels = []; + this.commandEncoder.copyBufferToBuffer( + this.queryResolveBuffer!, 0, queryReadBuffer, 0, this.pendingDispatchNumber * 2 * 8); + } + + this.device.queue.submit([this.commandEncoder.finish()]); + 
this.gpuDataManager.refreshPendingBuffers(); + this.commandEncoder = null; + this.pendingDispatchNumber = 0; + + if (this.queryType !== 'none') { + void queryReadBuffer!.mapAsync(GPUMapMode.READ).then(() => { + const mappedData = new BigUint64Array(queryReadBuffer.getMappedRange()); + const pendingKernels = this.pendingQueries.get(queryReadBuffer)!; + for (let i = 0; i < mappedData.length / 2; i++) { + const pendingKernelInfo = pendingKernels[i]; + const kernelId = pendingKernelInfo.kernelId; + const kernelInfo = this.kernels.get(kernelId)!; + const kernelType = kernelInfo.kernelType; + const kernelName = kernelInfo.kernelName; + const programName = pendingKernelInfo.programName; + const inputTensorViews = pendingKernelInfo.inputTensorViews; + const outputTensorViews = pendingKernelInfo.outputTensorViews; + const startTimeU64 = mappedData[i * 2]; + const endTimeU64 = mappedData[i * 2 + 1]; + + if (typeof this.queryTimeBase === 'undefined') { + this.queryTimeBase = startTimeU64; + } + + const startTime = Number(startTimeU64 - this.queryTimeBase); + const endTime = Number(endTimeU64 - this.queryTimeBase); + + if (!Number.isSafeInteger(startTime) || !Number.isSafeInteger(endTime)) { + throw new RangeError('incorrect timestamp range'); + } + + if (this.env.webgpu.profiling?.ondata) { + this.env.webgpu.profiling.ondata({ + version: 1, + inputsMetadata: inputTensorViews.map( + value => ({dims: value.dims, dataType: tensorDataTypeEnumToString(value.dataType)})), + outputsMetadata: outputTensorViews.map( + value => ({dims: value.dims, dataType: tensorDataTypeEnumToString(value.dataType)})), + kernelId, + kernelType, + kernelName, + programName, + startTime, + endTime, + }); + } else { + // if no callback is provided, print the profiling message to console + let inputShapes = ''; + inputTensorViews.forEach((value, i) => { + inputShapes += `input[${i}]: [${value.dims}] | ${tensorDataTypeEnumToString(value.dataType)}, `; + }); + let outputShapes = ''; + outputTensorViews.forEach((value, i) => { + outputShapes += `output[${i}]: [${value.dims}] | ${tensorDataTypeEnumToString(value.dataType)}, `; + }); + // eslint-disable-next-line no-console + console.log(`[profiling] kernel "${kernelId}|${kernelType}|${kernelName}|${programName}" ${inputShapes}${ + outputShapes}execution time: ${endTime - startTime} ns`); + } + TRACE('GPU', `${programName}::${startTimeU64}::${endTimeU64}`); + } + queryReadBuffer.unmap(); + this.pendingQueries.delete(queryReadBuffer); + }); + } + TRACE_FUNC_END(); } /** @@ -263,14 +406,20 @@ export class WebGpuBackend { run(program: ProgramInfo, inputTensorViews: readonly TensorView[], outputIndices: readonly number[], createKernelOutput: (index: number, dataType: number, dims: readonly number[]) => TensorView, createIntermediateOutput: (dataType: number, dims: readonly number[]) => TensorView): TensorView[] { + TRACE_FUNC_BEGIN(program.name); // create info for inputs const inputDatas: GpuData[] = []; for (let i = 0; i < inputTensorViews.length; ++i) { - const gpuData = this.gpuDataManager.get(inputTensorViews[i].data); + const data = inputTensorViews[i].data; + // if tensor view data is 0, it means the output is zero-sized tensor, and there is no GPU data for it. 
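The mapAsync callback above turns pairs of u64 GPU timestamps (in nanoseconds) into JS numbers by rebasing against the first timestamp observed; a raw Number() conversion would silently lose precision past 2^53, hence the safe-integer check. A distilled sketch of just that arithmetic; toProfilingTimes is an illustrative name:

// mapped holds [begin0, end0, begin1, end1, ...] as resolved from the query set.
function toProfilingTimes(
    mapped: BigUint64Array, queryTimeBase: bigint): Array<{startTime: number; endTime: number}> {
  const result: Array<{startTime: number; endTime: number}> = [];
  for (let i = 0; i < mapped.length / 2; i++) {
    // rebase as bigint first so the subsequent Number() stays small
    const startTime = Number(mapped[i * 2] - queryTimeBase);
    const endTime = Number(mapped[i * 2 + 1] - queryTimeBase);
    if (!Number.isSafeInteger(startTime) || !Number.isSafeInteger(endTime)) {
      throw new RangeError('incorrect timestamp range');
    }
    result.push({startTime, endTime});
  }
  return result;
}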
+ if (data === 0) { + continue; + } + const gpuData = this.gpuDataManager.get(data); if (!gpuData) { - throw new Error(`no GPU data for input: ${inputTensorViews[i].data}`); + throw new Error(`no GPU data for input: ${data}`); } - inputDatas[i] = gpuData; + inputDatas.push(gpuData); } const {outputs, dispatchGroup, programUniforms} = program.getRunData(inputTensorViews); @@ -300,6 +449,11 @@ export class WebGpuBackend { const tensorView = (isTemporary || isPersistent) ? createIntermediateOutput(outputs[i].dataType, outputs[i].dims) : createKernelOutput(validatedOutputIndices[i], outputs[i].dataType, outputs[i].dims); + outputTensorViews.push(tensorView); + // if tensor view data is 0, it means the output is zero-sized tensor, and there is no GPU data for it. + if (tensorView.data === 0) { + continue; + } const gpuData = this.gpuDataManager.get(tensorView.data); if (!gpuData) { throw new Error(`no GPU data for output: ${tensorView.data}`); @@ -315,10 +469,24 @@ export class WebGpuBackend { } persistentData.push(gpuData); } - outputTensorViews.push(tensorView); outputDatas.push(gpuData); } + // when there are any zero-sized tensor in the inputs or outputs, we should report error unless all outputs are + // zero-sized tensors. + if (inputDatas.length !== inputTensorViews.length || outputDatas.length !== outputTensorViews.length) { + // if all outputs are zero-sized tensors, there is no need to run the program. + if (outputDatas.length === 0) { + TRACE_FUNC_END(program.name); + return outputTensorViews; + } + // if some outputs are zero-sized tensors, report an error. + // + // TODO: so far we don't see any use case that outputs include both zero-sized tensors and non-zero-sized tensors. + // If we see such use case, we need to make a change here to support it. + throw new Error( + `Program ${program.name} has zero-sized tensor(s) in inputs or outputs. This is not supported now.`); + } // load uniforms // TODO: add cache for uniform (is it necessary?) @@ -334,13 +502,26 @@ export class WebGpuBackend { return; } // https://www.w3.org/TR/WGSL/#alignof - const baseAlignment = data.length <= 2 ? data.length * 4 : 16; + const sizeOfElement = v.type === DataType.float16 ? 2 : 4; + let sizeOfVecOrMat; + let baseAlignment; + if (v.type === DataType.float16) { + baseAlignment = data.length > 4 ? 16 : (data.length > 2 ? 8 : data.length * sizeOfElement); + sizeOfVecOrMat = data.length > 4 ? 16 : sizeOfElement * data.length; + } else { + baseAlignment = data.length <= 2 ? data.length * sizeOfElement : 16; + sizeOfVecOrMat = 16; + } currentOffset = Math.ceil(currentOffset / baseAlignment) * baseAlignment; offsets.push(currentOffset); - // When data.length > 4, the uniform variable is of type array,N>, where N = - // Math.ceil(data.length / 4) and SizeOf(vec4) = 16. The total byte length is N * - // SizeOf(vec4). - currentOffset += data.length > 4 ? Math.ceil(data.length / 4) * 16 : data.length * 4; + // For non-float16 type, when data.length > 4, the uniform variable is of type array,N>, where + // N = Math.ceil(data.length / 4) and SizeOf(vec4) = 16. The total byte length is N * + // SizeOf(vec4). For float16 type, when data.length > 4, the uniform variable is of type + // array,N>, where N = Math.ceil(data.length / 8) and SizeOf(mat2x4) = 16. The total byte + // length is N * SizeOf(mat2x4). + const elementPerVecOrMat = v.type === DataType.float16 ? 8 : 4; + currentOffset += data.length > 4 ? 
Math.ceil(data.length / elementPerVecOrMat) * sizeOfVecOrMat : + data.length * sizeOfElement; }); // Meet alignment of struct here: https://www.w3.org/TR/WGSL/#alignment-and-size. For simplicity, set @@ -351,12 +532,17 @@ export class WebGpuBackend { programUniforms.forEach((v, i) => { const offset = offsets[i]; const data = typeof v.data === 'number' ? [v.data] : v.data; - if (v.type === 'int32') { + if (v.type === DataType.int32) { new Int32Array(arrayBuffer, offset, data.length).set(data); - } else if (v.type === 'uint32') { + } else if (v.type === DataType.uint32) { new Uint32Array(arrayBuffer, offset, data.length).set(data); - } else { + } else if (v.type === DataType.float16) { + // TODO: use Float16Array. + new Uint16Array(arrayBuffer, offset, data.length).set(data); + } else if (v.type === DataType.float) { new Float32Array(arrayBuffer, offset, data.length).set(data); + } else { + throw new Error(`Unsupported uniform type: ${tensorDataTypeEnumToString(v.type)}`); } }); @@ -379,14 +565,47 @@ export class WebGpuBackend { LOG_DEBUG('info', () => `[artifact] key: ${key}, programName: ${program.name}`); } + // validate uniform variables + if (programUniforms && artifact.uniformVariablesInfo) { + if (programUniforms.length !== artifact.uniformVariablesInfo.length) { + throw new Error(`Uniform variables count mismatch: expect ${artifact.uniformVariablesInfo.length}, got ${ + programUniforms.length} in program "${artifact.programInfo.name}".`); + } + for (let i = 0; i < programUniforms.length; i++) { + const uniform = programUniforms[i]; + const actualType = uniform.type; + const actualLength = typeof uniform.data === 'number' ? 1 : uniform.data.length; + const [type, length] = artifact.uniformVariablesInfo[i]; + if (actualType !== type || actualLength !== length) { + throw new Error(`Uniform variable ${i} mismatch: expect type ${type} with size ${length}, got type ${ + actualType} with size ${actualLength} in program "${artifact.programInfo.name}".`); + } + } + } + LOG_DEBUG( 'info', () => `[ProgramManager] run "${program.name}" (key=${key}) with ${normalizedDispatchGroup[0]}x${ normalizedDispatchGroup[1]}x${normalizedDispatchGroup[2]}`); - this.programManager.run( - artifact, inputTensorViews, outputTensorViews, inputDatas, outputDatas, normalizedDispatchGroup, - uniformBufferBinding); + if (this.queryType !== 'none' || this.sessionStatus === 'capturing') { + const pendingKernelInfo: PendingKernelInfo = { + kernelId: this.currentKernelId!, + programName: artifact.programInfo.name, + inputTensorViews, + outputTensorViews, + }; + this.pendingKernels.push(pendingKernelInfo); + + if (this.sessionStatus === 'capturing') { + const sessionPendingKernels = this.capturedPendingKernels.get(this.currentSessionId!); + sessionPendingKernels!.push(pendingKernelInfo); + } + } + + this.programManager.run(artifact, inputDatas, outputDatas, normalizedDispatchGroup, uniformBufferBinding); + + TRACE_FUNC_END(program.name); return outputTensorViews; } @@ -412,13 +631,19 @@ export class WebGpuBackend { return this.gpuDataManager.release(ptr); } - createKernel(opType: string, kernelId: number, attribute: unknown, nodeName: string): void { - const op = WEBGPU_OP_RESOLVE_RULES.get(opType); + createKernel(kernelType: string, kernelId: number, attribute: unknown, kernelName: string): void { + const op = WEBGPU_OP_RESOLVE_RULES.get(kernelType); if (!op) { - throw new Error(`kernel not implemented: ${opType}`); + throw new Error(`kernel not implemented: ${kernelType}`); } - this.kernels.set(kernelId, [opType, 
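The offset computation above follows WGSL's uniform address-space layout: f32 data is grouped into 16-byte vec4<f32> rows, while f16 packs eight elements into a 16-byte mat2x4<f16>, and each value is first aligned to its base alignment. The same machinery is what lets the conv2d programs later in this diff pass pads, strides and dilations as uniforms instead of baked-in shader constants. A standalone sketch of the rule; appendUniform is an illustrative name, and the DataType values follow the ONNX TensorProto enum (FLOAT = 1, FLOAT16 = 10):

const enum DataType { float = 1, float16 = 10 }

// Returns the aligned offset for one uniform of `length` elements plus the
// offset at which the next uniform may start, per https://www.w3.org/TR/WGSL/#alignof.
function appendUniform(
    currentOffset: number, type: DataType, length: number): {offset: number; next: number} {
  const sizeOfElement = type === DataType.float16 ? 2 : 4;
  let baseAlignment: number;
  let sizeOfVecOrMat: number;
  if (type === DataType.float16) {
    baseAlignment = length > 4 ? 16 : (length > 2 ? 8 : length * sizeOfElement);
    sizeOfVecOrMat = length > 4 ? 16 : sizeOfElement * length;
  } else {
    baseAlignment = length <= 2 ? length * sizeOfElement : 16;
    sizeOfVecOrMat = 16;  // vec4<f32>
  }
  const offset = Math.ceil(currentOffset / baseAlignment) * baseAlignment;
  // arrays use one 16-byte vec/mat per 4 (f32) or 8 (f16) elements
  const elementPerVecOrMat = type === DataType.float16 ? 8 : 4;
  const next = offset +
      (length > 4 ? Math.ceil(length / elementPerVecOrMat) * sizeOfVecOrMat : length * sizeOfElement);
  return {offset, next};
}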
nodeName, op[0], [op[1], attribute]]); + const kernelInfo: KernelInfo = { + kernelType, + kernelName, + kernelEntry: op[0], + attributes: [op[1], attribute], + }; + this.kernels.set(kernelId, kernelInfo); } releaseKernel(kernelId: number): void { @@ -439,9 +664,12 @@ export class WebGpuBackend { if (!kernel) { throw new Error(`kernel not created: ${kernelId}`); } - const [opType, nodeName, kernelEntry, attributes] = kernel; + const kernelType = kernel.kernelType; + const kernelName = kernel.kernelName; + const kernelEntry = kernel.kernelEntry; + const attributes = kernel.attributes; if (this.currentKernelId !== null) { - throw new Error(`kernel "[${opType}] ${nodeName}" is not allowed to be called recursively`); + throw new Error(`kernel "[${kernelType}] ${kernelName}" is not allowed to be called recursively`); } this.currentKernelId = kernelId; @@ -451,7 +679,7 @@ export class WebGpuBackend { attributes[0] = undefined; } - LOG_DEBUG('info', () => `[WebGPU] Start to run kernel "[${opType}] ${nodeName}"...`); + LOG_DEBUG('info', () => `[WebGPU] Start to run kernel "[${kernelType}] ${kernelName}"...`); const useErrorScope = this.env.debug; @@ -464,12 +692,12 @@ export class WebGpuBackend { kernelEntry(context, attributes[1]); return 0; // ORT_OK } catch (e) { - errors.push(Promise.resolve(`[WebGPU] Kernel "[${opType}] ${nodeName}" failed. ${e}`)); + errors.push(Promise.resolve(`[WebGPU] Kernel "[${kernelType}] ${kernelName}" failed. ${e}`)); return 1; // ORT_FAIL } finally { if (useErrorScope) { errors.push(this.device.popErrorScope().then( - err => err ? `GPU validation error for kernel "[${opType}] ${nodeName}": ${err.message}` : null)); + err => err ? `GPU validation error for kernel "[${kernelType}] ${kernelName}": ${err.message}` : null)); } for (const data of this.temporaryData) { @@ -515,4 +743,98 @@ export class WebGpuBackend { }; } // #endregion + writeTimestamp(index: number): void { + if (this.queryType !== 'inside-passes') { + return; + } + + // eslint-disable-next-line @typescript-eslint/no-explicit-any + (this.computePassEncoder as any).writeTimestamp(this.querySet, index); + } + setQueryType(): void { + this.queryType = 'none'; + if (this.env.webgpu.profiling?.mode === 'default' || + (typeof this.env.trace === 'undefined' ? this.env.wasm.trace : this.env.trace)) { + if (this.device.features.has('chromium-experimental-timestamp-query-inside-passes')) { + this.queryType = 'inside-passes'; + } else if (this.device.features.has('timestamp-query')) { + this.queryType = 'at-passes'; + } + + if (this.queryType !== 'none' && typeof this.querySet === 'undefined') { + this.querySet = this.device.createQuerySet({ + type: 'timestamp', + count: this.maxDispatchNumber * 2, + }); + this.queryResolveBuffer = this.device.createBuffer( + // eslint-disable-next-line no-bitwise + {size: this.maxDispatchNumber * 2 * 8, usage: GPUBufferUsage.COPY_SRC | GPUBufferUsage.QUERY_RESOLVE}); + } + } + } + + captureBegin(): void { + LOG_DEBUG('info', 'captureBegin'); + if (!this.capturedCommandList.get(this.currentSessionId!)) { + this.capturedCommandList.set(this.currentSessionId!, []); + } + if (!this.capturedPendingKernels.get(this.currentSessionId!)) { + this.capturedPendingKernels.set(this.currentSessionId!, []); + } + // flush the left commands before we change the status. + this.flush(); + this.sessionStatus = 'capturing'; + } + captureEnd(): void { + LOG_DEBUG('info', 'captureEnd'); + // flush the left commands before we change the status. 
+ this.flush(); + this.sessionStatus = 'default'; + } + replay(): void { + LOG_DEBUG('info', 'replay'); + this.sessionStatus = 'replaying'; + const sessionCommandList = this.capturedCommandList.get(this.currentSessionId!); + const sessionPendingKernels = this.capturedPendingKernels.get(this.currentSessionId!); + const length = sessionCommandList!.length; + this.pendingKernels = []; + for (let i = 0; i < length; i++) { + const computePassEncoder = this.getComputePassEncoder(); + const command = sessionCommandList![i]; + this.writeTimestamp(this.pendingDispatchNumber * 2); + computePassEncoder.setPipeline(command.computePipeline); + computePassEncoder.setBindGroup(0, command.bindGroup); + computePassEncoder.dispatchWorkgroups(...command.dispatchGroup); + this.writeTimestamp(this.pendingDispatchNumber * 2 + 1); + this.pendingDispatchNumber++; + if (this.queryType !== 'none') { + this.pendingKernels.push(sessionPendingKernels![i]); + } + if (this.pendingDispatchNumber >= this.maxDispatchNumber || this.queryType === 'at-passes') { + this.endComputePass(); + } + if (this.pendingDispatchNumber >= this.maxDispatchNumber) { + this.flush(); + } + } + // flush the left commands before we change the status. + this.flush(); + this.sessionStatus = 'default'; + } + + onReleaseSession(sessionId: number): void { + this.unregisterBuffers(sessionId); + if (this.capturedCommandList.has(sessionId)) { + this.capturedCommandList.delete(sessionId); + } + if (this.capturedPendingKernels.has(sessionId)) { + this.capturedPendingKernels.delete(sessionId); + } + this.gpuDataManager.onReleaseSession(sessionId); + } + + onRunStart(sessionId: number): void { + this.currentSessionId = sessionId; + this.setQueryType(); + } } diff --git a/js/web/lib/wasm/jsep/init.ts b/js/web/lib/wasm/jsep/init.ts index 3c6edf3ebb35..1ceae2394f46 100644 --- a/js/web/lib/wasm/jsep/init.ts +++ b/js/web/lib/wasm/jsep/init.ts @@ -10,7 +10,7 @@ import {WebGpuBackend} from './backend-webgpu'; import {LOG_DEBUG} from './log'; import {TensorView} from './tensor-view'; import {ShapeUtil} from './util'; -import {ComputeContext, ComputeContextInputsOutputsMapping, ProgramInfo} from './webgpu/types'; +import {AdapterInfo, ComputeContext, ComputeContextInputsOutputsMapping, ProgramInfo} from './webgpu/types'; /* eslint-disable no-bitwise */ @@ -54,6 +54,7 @@ class TensorViewImpl implements TensorView { } class ComputeContextImpl implements ComputeContext { + readonly adapterInfo: AdapterInfo; readonly opKernelContext: number; readonly inputs: readonly TensorView[]; readonly outputCount: number; @@ -66,6 +67,7 @@ class ComputeContextImpl implements ComputeContext { private customDataOffset = 0; private customDataSize = 0; constructor(private module: OrtWasmModule, private backend: WebGpuBackend, contextDataOffset: number) { + this.adapterInfo = backend.adapterInfo; const heapU32 = module.HEAPU32; // extract context data @@ -90,6 +92,17 @@ class ComputeContextImpl implements ComputeContext { this.inputs = inputs; } + getMaxComputeWorkgroupSizes(): [number, number, number] { + return [ + this.backend.device.limits.maxComputeWorkgroupSizeX, this.backend.device.limits.maxComputeWorkgroupSizeY, + this.backend.device.limits.maxComputeWorkgroupSizeZ + ]; + } + + getMaxComputeWorkgroupStoragesize(): number { + return this.backend.device.limits.maxComputeWorkgroupStorageSize; + } + compute(program: ProgramInfo, inputsOutputsMapping?: ComputeContextInputsOutputsMapping): TensorView[] { // prepare inputs. inputs should always be valid data. 
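Taken together, captureBegin/captureEnd/replay above implement a record-once, replay-many scheme: the first run of a capturable session records every dispatch into capturedCommandList, and later runs re-submit those commands without re-running the CPU-side kernel logic. In the real integration these methods are registered through jsepInit and invoked from the C++ side; the wrapper below only illustrates the intended ordering, and runWithGraphCapture is an invented name:

// Assumed driver-side ordering of the capture/replay hooks.
async function runWithGraphCapture(
    backend: {captureBegin(): void; captureEnd(): void; replay(): void; onRunStart(sessionId: number): void},
    sessionId: number, firstRun: boolean, executeGraph: () => Promise<void>): Promise<void> {
  backend.onRunStart(sessionId);   // sets currentSessionId and the query type
  if (firstRun) {
    backend.captureBegin();        // flush, then start recording dispatches
    await executeGraph();          // kernels run and their commands are recorded
    backend.captureEnd();          // flush and return to 'default' status
  } else {
    backend.replay();              // re-dispatch the recorded command list
  }
}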
const mappedInputs = @@ -104,7 +117,8 @@ class ComputeContextImpl implements ComputeContext { throw new Error(`Unsupported data type: ${dataType}`); } const bufferSize = elementSize * ShapeUtil.size(dims); - return new TensorViewImpl(this.module, dataType, this.backend.gpuDataManager.create(bufferSize).id, dims); + const gpuDataId = bufferSize > 0 ? this.backend.gpuDataManager.create(bufferSize).id : 0; + return new TensorViewImpl(this.module, dataType, gpuDataId, dims); }; return this.backend.run(program, mappedInputs, outputIndices, createKernelOutput, createTemporaryOutput); } @@ -118,7 +132,7 @@ class ComputeContextImpl implements ComputeContext { for (let i = 0; i < dims.length; i++) { this.module.HEAPU32[offset++] = dims[i]; } - return this.module._JsepOutput(this.opKernelContext, index, data); + return this.module._JsepOutput!(this.opKernelContext, index, data); } catch (e) { throw new Error( `Failed to generate kernel's output[${index}] with dims [${dims}]. ` + @@ -133,27 +147,39 @@ class ComputeContextImpl implements ComputeContext { /** * Initialize JSEP with WebGPU backend. * - * This function will be called only once after the WebAssembly module is loaded and initialized ("_OrtInit" is called). - * This function expects: + * This function will be called after the WebAssembly module is loaded and initialized ("_OrtInit" is called), once for + * each of the following EPs if they are specified: + * - "webgpu" + * - "webnn" + * + * For WebGPU, this function expects: * - WebGPU is enabled in build (BUILD_DEFS.DISABLE_WEBGPU === false). * - WebGPU is available in current environment. (a valid GPUAdapter is passed in) + * + * For WebNN, this function expects: + * - WebNN is enabled in build (BUILD_DEFS.DISABLE_WEBGPU === false). + * - WebNN is available in current environment. (navigator.ml is not undefined) + * * If the WebAssembly module is not built with JSEP support, this function will throw an error. This will invalidate - * 'webgpu' backend. + * 'webgpu'/'webnn' backend. * + * @param name - the name of the EP, either "webgpu" or "webnn" * @param module - the ORT WebAssembly module * @param env - the ORT environment variable (ort.env) * @param gpuAdapter - the pre-created GPU adapter */ -export const init = async(module: OrtWasmModule, env: Env, gpuAdapter: GPUAdapter): Promise => { +export const init = + async(name: 'webgpu'|'webnn', module: OrtWasmModule, env: Env, gpuAdapter?: GPUAdapter): Promise => { const jsepInit = module.jsepInit; if (!jsepInit) { throw new Error('Failed to initialize JSEP. 
The WebAssembly module is not built with JSEP support.'); } - const backend = new WebGpuBackend(); - await backend.initialize(env, gpuAdapter); + if (name === 'webgpu') { + const backend = new WebGpuBackend(); + await backend.initialize(env, gpuAdapter!); - jsepInit( + jsepInit('webgpu', [ // backend backend, @@ -170,7 +196,7 @@ export const init = async(module: OrtWasmModule, env: Env, gpuAdapter: GPUAdapte backend.memcpy(src, dst); } else { LOG_DEBUG('verbose', () => `[WebGPU] jsepCopyCpuToGpu: dataOffset=${src}, gpuDataId=${dst}, size=${size}`); - const data = module.HEAPU8.subarray(src, src + size); + const data = module.HEAPU8.subarray(src >>> 0, (src >>> 0) + size); backend.upload(dst, data); } }, @@ -182,13 +208,13 @@ export const init = async(module: OrtWasmModule, env: Env, gpuAdapter: GPUAdapte 'verbose', () => `[WebGPU] jsepCopyGpuToCpu: gpuDataId=${gpuDataId}, dataOffset=${dataOffset}, size=${size}`); - await backend.download(gpuDataId, () => module.HEAPU8.subarray(dataOffset, dataOffset + size)); + await backend.download( + gpuDataId, () => module.HEAPU8.subarray(dataOffset >>> 0, (dataOffset >>> 0) + size)); }, // jsepCreateKernel - (name: string, kernel: number, attribute: unknown) => backend.createKernel( - name, kernel, attribute, - env.debug || backend.isQueryEnabled() ? module.UTF8ToString(module._JsepGetNodeName(kernel)) : `${kernel}`), + (kernelType: string, kernelId: number, attribute: unknown) => backend.createKernel( + kernelType, kernelId, attribute, module.UTF8ToString(module._JsepGetNodeName!(kernelId))), // jsepReleaseKernel (kernel: number) => backend.releaseKernel(kernel), @@ -201,5 +227,15 @@ export const init = async(module: OrtWasmModule, env: Env, gpuAdapter: GPUAdapte contextDataOffset}`); const context = new ComputeContextImpl(module, backend, contextDataOffset); return backend.computeKernel(kernel, context, errors); - }); + }, + // jsepCaptureBegin + () => backend.captureBegin(), + // jsepCaptureEnd + () => backend.captureEnd(), + // jsepReplay + () => backend.replay() + ]); + } else { + jsepInit('webnn'); + } }; diff --git a/js/web/lib/wasm/jsep/util.ts b/js/web/lib/wasm/jsep/util.ts index 6922d7ff5df6..9a1d5463f784 100644 --- a/js/web/lib/wasm/jsep/util.ts +++ b/js/web/lib/wasm/jsep/util.ts @@ -56,7 +56,16 @@ export class BroadcastUtil { if (aLen !== bLen && aLen > 1 && bLen > 1) { return undefined; } - cdims[crank - i] = Math.max(aLen, bLen); + const max = Math.max(aLen, bLen); + if (aLen && bLen) { + cdims[crank - i] = Math.max(aLen, bLen); + } else { + // when either aLen or bLen is 0, the other should be either 0 or 1, otherwise it is not broadcastable. + if (max > 1) { + return undefined; + } + cdims[crank - i] = 0; + } } return cdims; @@ -92,6 +101,34 @@ export class ShapeUtil { return ShapeUtil.getSizeFromDimensionRange(dims, 0, dims.length); } + /** + * convert dims corresponding to type change to pack. ex. 
uint8 data to uint32 + */ + static convertShape(dims: readonly number[], size = 4): readonly number[] { + const rank = dims.length; + if (rank === 0) { + return []; + } + const newDims = new Array(rank); + let i = rank - 1; + while (i >= 0) { + if (dims[i] % size === 0) { + newDims[i] = dims[i] / size; + break; + } + if (size % dims[i] !== 0) { + throw new Error('cannot convert shape'); + } + newDims[i] = 1; + size /= dims[i]; + i--; + } + for (i--; i >= 0; i--) { + newDims[i] = dims[i]; + } + return newDims; + } + /** * calculate the size (number of elements) from the given axis (inclusive) */ diff --git a/js/web/lib/wasm/jsep/webgpu/gpu-data-manager.ts b/js/web/lib/wasm/jsep/webgpu/gpu-data-manager.ts index 6f3d9a52d9f5..c17bd1e1477e 100644 --- a/js/web/lib/wasm/jsep/webgpu/gpu-data-manager.ts +++ b/js/web/lib/wasm/jsep/webgpu/gpu-data-manager.ts @@ -60,9 +60,15 @@ export interface GpuDataManager { unregisterExternalBuffer(buffer: GPUBuffer): void; /** - * destroy all gpu buffers. Call this when the session.release is called. + * destroy all gpu buffers. */ dispose(): void; + + /** + * release session related data. + * @param sessionId - specify the session ID. + */ + onReleaseSession(sessionId: number): void; } interface StorageCacheValue { @@ -139,6 +145,10 @@ class GpuDataManagerImpl implements GpuDataManager { // The external buffers registered users for IO Binding. private externalBuffers: Map; + // The pendingBuffers for capture graph. + // a SessionID -> GPUBuffer[] mapping. + private capturedPendingBuffers: Map; + constructor(private backend: WebGpuBackend) { this.storageCache = new Map(); this.freeBuffers = new Map(); @@ -146,6 +156,7 @@ class GpuDataManagerImpl implements GpuDataManager { this.buffersForUploadingPending = []; this.buffersPending = []; this.externalBuffers = new Map(); + this.capturedPendingBuffers = new Map(); } upload(id: GpuDataId, data: Uint8Array): void { @@ -220,6 +231,9 @@ class GpuDataManagerImpl implements GpuDataManager { () => `[WebGPU] GpuDataManager.registerExternalBuffer(size=${originalSize}) => id=${ id}, buffer is the same, skip.`); return id; + } else if (this.backend.capturedCommandList.has(this.backend.currentSessionId!)) { + throw new Error(`Registering a different external buffer under graph capture mode is not supported yet. + Please use the previous external buffer!`); } this.externalBuffers.delete(previousBuffer); } else { @@ -312,20 +326,39 @@ class GpuDataManagerImpl implements GpuDataManager { buffer.destroy(); } this.buffersForUploadingPending = []; - for (const buffer of this.buffersPending) { - // eslint-disable-next-line no-bitwise - if ((buffer.usage & GPUBufferUsage.STORAGE) === GPUBufferUsage.STORAGE) { - // Put the pending buffer to freeBuffers list instead of really destroying it for buffer reusing. - this.freeBuffers.get(buffer.size)!.push(buffer); + + if (this.buffersPending.length === 0) { + return; + } + + if (this.backend.sessionStatus === 'default') { + for (const buffer of this.buffersPending) { // eslint-disable-next-line no-bitwise - } else if ((buffer.usage & GPUBufferUsage.UNIFORM) === GPUBufferUsage.UNIFORM) { - // Put the pending buffer to freeUniformBuffers list instead of really destroying it for buffer reusing. - this.freeUniformBuffers.get(buffer.size)!.push(buffer); - } else { - buffer.destroy(); + if ((buffer.usage & GPUBufferUsage.STORAGE) === GPUBufferUsage.STORAGE) { + // Put the pending buffer to freeBuffers list instead of really destroying it for buffer reusing. 
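Worked examples for ShapeUtil.convertShape from the util.ts hunk above, which rescales a shape when elements are packed, for instance four uint8 values read as one uint32 (the import path is assumed):

import {ShapeUtil} from './util';

// the trailing dimension is divisible by the pack size, so it is divided:
console.log(ShapeUtil.convertShape([2, 3, 8], 4));  // [2, 3, 2]

// trailing dimensions smaller than the pack size collapse to 1 while the
// remaining factor is taken from an earlier dimension: 4*2*2 bytes -> 4 words
console.log(ShapeUtil.convertShape([4, 2, 2], 4));  // [4, 1, 1]

// shapes whose trailing dimensions cannot absorb the pack size are rejected:
// ShapeUtil.convertShape([5, 3], 4) throws Error('cannot convert shape')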
+ this.freeBuffers.get(buffer.size)!.push(buffer); + // eslint-disable-next-line no-bitwise + } else if ((buffer.usage & GPUBufferUsage.UNIFORM) === GPUBufferUsage.UNIFORM) { + // Put the pending buffer to freeUniformBuffers list instead of really destroying it for buffer reusing. + this.freeUniformBuffers.get(buffer.size)!.push(buffer); + } else { + buffer.destroy(); + } + } + this.buffersPending = []; + } else { + // Don't release intermediate tensors in non-default mode. + // TODO: reuse the storage buffers in non-default mode. + let capturedBuffers = this.capturedPendingBuffers.get(this.backend.currentSessionId!); + if (!capturedBuffers) { + capturedBuffers = []; + this.capturedPendingBuffers.set(this.backend.currentSessionId!, capturedBuffers); } + for (const buffer of this.buffersPending) { + capturedBuffers.push(buffer); + } + this.buffersPending = []; } - this.buffersPending = []; } dispose() { @@ -344,9 +377,26 @@ class GpuDataManagerImpl implements GpuDataManager { storage.gpuData.buffer.destroy(); }); + this.capturedPendingBuffers.forEach((buffers) => { + buffers.forEach(buffer => { + buffer.destroy(); + }); + }); this.storageCache = new Map(); this.freeBuffers = new Map(); this.freeUniformBuffers = new Map(); + this.capturedPendingBuffers = new Map(); + } + + onReleaseSession(sessionId: number) { + // release the captured pending buffers. + const pendingBuffers = this.capturedPendingBuffers.get(sessionId); + if (pendingBuffers) { + pendingBuffers.forEach(buffer => { + buffer.destroy(); + }); + this.capturedPendingBuffers.delete(sessionId); + } } } diff --git a/js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts b/js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts index 8e1ec782079b..5627365100d9 100644 --- a/js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts +++ b/js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts @@ -2,7 +2,7 @@ // Licensed under the MIT License. 
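Before moving on to the operator registry: the refreshPendingBuffers branches above split buffer recycling by session status. In 'default' mode pending buffers go back to the per-size free lists for reuse; while a graph is being captured they must stay alive, parked per session until onReleaseSession destroys them. A simplified sketch of that split (releasePendingBuffers is an invented name, and the separate uniform-buffer free list and the destroy branch are omitted):

function releasePendingBuffers(
    sessionStatus: 'default'|'capturing'|'replaying', sessionId: number, buffersPending: GPUBuffer[],
    freeBuffers: Map<number, GPUBuffer[]>, capturedPendingBuffers: Map<number, GPUBuffer[]>): void {
  if (sessionStatus === 'default') {
    for (const buffer of buffersPending) {
      // recycle storage buffers into the free list for their size class
      freeBuffers.get(buffer.size)?.push(buffer);
    }
  } else {
    // capture/replay mode: keep the buffers alive until the session is released
    let captured = capturedPendingBuffers.get(sessionId);
    if (!captured) {
      captured = [];
      capturedPendingBuffers.set(sessionId, captured);
    }
    captured.push(...buffersPending);
  }
  buffersPending.length = 0;  // the caller's pending list is now empty
}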
import {argMax, argMin, parseArgMinMaxAttributes} from './ops/argminmax'; -import {attention, parseAttentionAttributes} from './ops/attention'; +import {attention} from './ops/attention'; import {batchNorm} from './ops/batch-norm'; import {biasAdd} from './ops/bias-add'; import {biasSplitGelu} from './ops/bias-split-gelu'; @@ -11,21 +11,25 @@ import {concat, parseConcatAttributes} from './ops/concat'; import {conv, parseConvAttributes} from './ops/conv'; import {convTranspose, parseConvTransposeAttributes} from './ops/conv-transpose'; import {cumsum, parseCumSumAttributes} from './ops/cumsum'; +import {depthToSpace, parseDepthToSpaceAttributes} from './ops/depth-to-space'; import {einsum, parseEinsumAttributes} from './ops/einsum'; import {expand} from './ops/expand'; +import {fastGelu} from './ops/fast-gelu'; import {gather, parseGatherAttributes} from './ops/gather'; import {gatherElements, parseGatherElementsAttributes} from './ops/gather-elements'; import {gemm, parseGemmAttributes} from './ops/gemm'; -import {instanceNorm, parseInstanceNormAttributes} from './ops/instance-norm'; -import {layerNorm, parseLayerNormAttributes} from './ops/layer-norm'; +import {instanceNorm} from './ops/instance-norm'; +import {layerNorm} from './ops/layer-norm'; import {matMul} from './ops/matmul'; +import {matMulNBits, parseMatMulNBitsAttributes} from './ops/matmulnbits'; import {multiHeadAttention, parseMultiHeadAttentionAttributes} from './ops/multi-head-attentiion'; -import {pad, parsePadAttributes} from './ops/pad'; +import {pad} from './ops/pad'; import * as pool from './ops/pool'; import {range} from './ops/range'; import {reduceL1, reduceL2, reduceLogSum, reduceLogSumExp, reduceMax, reduceMean, reduceMin, reduceProd, reduceSum, reduceSumSquare} from './ops/reduce'; import {parseResizeAttributes, resize} from './ops/resize'; -import {parseSkipLayerNormAttributes, skipLayerNorm} from './ops/skip-layer-norm'; +import {rotaryEmbedding} from './ops/rotary-embedding'; +import {skipLayerNorm} from './ops/skip-layer-norm'; import {parseSliceAttributes, slice} from './ops/slice'; import {parseSoftmaxAttributes, softmax} from './ops/softmax'; import {parseSplitAttributes, split} from './ops/split'; @@ -50,7 +54,7 @@ export const WEBGPU_OP_RESOLVE_RULES: Map = new ['Asinh', [unaryOps.asinh]], ['Atan', [unaryOps.atan]], ['Atanh', [unaryOps.atanh]], - ['Attention', [attention, parseAttentionAttributes]], + ['Attention', [attention]], // TODO: support new attributes for AveragePool-10 ['AveragePool', [pool.averagePool, pool.parseAveragePoolAttributes]], ['BatchNormalization', [batchNorm]], @@ -65,6 +69,7 @@ export const WEBGPU_OP_RESOLVE_RULES: Map = new ['Cos', [unaryOps.cos]], ['Cosh', [unaryOps.cosh]], ['CumSum', [cumsum, parseCumSumAttributes]], + ['DepthToSpace', [depthToSpace, parseDepthToSpaceAttributes]], ['Div', [binaryOps.div]], ['Einsum', [einsum, parseEinsumAttributes]], ['Elu', [unaryOps.elu, unaryOps.parseAlphaAttributes]], @@ -72,6 +77,7 @@ export const WEBGPU_OP_RESOLVE_RULES: Map = new ['Erf', [unaryOps.erf]], ['Exp', [unaryOps.exp]], ['Expand', [expand]], + ['FastGelu', [fastGelu]], ['Floor', [unaryOps.floor]], ['FusedConv', [conv, parseConvAttributes]], ['Gather', [gather, parseGatherAttributes]], @@ -82,20 +88,22 @@ export const WEBGPU_OP_RESOLVE_RULES: Map = new ['GlobalMaxPool', [pool.globalMaxPool, pool.parseGlobalMaxPoolAttributes]], ['Greater', [binaryOps.greater]], ['GreaterOrEqual', [binaryOps.greaterOrEqual]], - ['InstanceNormalization', [instanceNorm, parseInstanceNormAttributes]], 
- ['LayerNormalization', [layerNorm, parseLayerNormAttributes]], + ['HardSigmoid', [unaryOps.hardSigmoid, unaryOps.parseHardSigmoidAttributes]], + ['InstanceNormalization', [instanceNorm]], + ['LayerNormalization', [layerNorm]], ['LeakyRelu', [unaryOps.leakyRelu, unaryOps.parseAlphaAttributes]], ['Less', [binaryOps.less]], ['LessOrEqual', [binaryOps.lessOrEqual]], ['Log', [unaryOps.log]], ['MatMul', [matMul]], + ['MatMulNBits', [matMulNBits, parseMatMulNBitsAttributes]], // TODO: support new attributes for MaxPool-8 and MaxPool-10 ['MaxPool', [pool.maxPool, pool.parseMaxPoolAttributes]], ['Mul', [binaryOps.mul]], ['MultiHeadAttention', [multiHeadAttention, parseMultiHeadAttentionAttributes]], ['Neg', [unaryOps.neg]], ['Not', [unaryOps.not]], - ['Pad', [pad, parsePadAttributes]], + ['Pad', [pad]], ['Pow', [binaryOps.pow]], ['Range', [range]], ['Reciprocal', [unaryOps.reciprocal]], @@ -111,11 +119,12 @@ export const WEBGPU_OP_RESOLVE_RULES: Map = new ['ReduceSumSquare', [reduceSumSquare]], ['Relu', [unaryOps.relu]], ['Resize', [resize, parseResizeAttributes]], + ['RotaryEmbedding', [rotaryEmbedding]], ['Sigmoid', [unaryOps.sigmoid]], ['Sin', [unaryOps.sin]], ['Sinh', [unaryOps.sinh]], ['Slice', [slice, parseSliceAttributes]], - ['SkipLayerNormalization', [skipLayerNorm, parseSkipLayerNormAttributes]], + ['SkipLayerNormalization', [skipLayerNorm]], ['Split', [split, parseSplitAttributes]], ['Sqrt', [unaryOps.sqrt]], ['Softmax', [softmax, parseSoftmaxAttributes]], diff --git a/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/conv2d_mm_webgpu.ts b/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/conv2d_mm_webgpu.ts index 3638938df7db..24006d393592 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/conv2d_mm_webgpu.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/conv2d_mm_webgpu.ts @@ -19,12 +19,13 @@ // // modified to fit the needs of the project +import {DataType} from '../../../../wasm-common'; import {LOG_DEBUG} from '../../../log'; import {TensorView} from '../../../tensor-view'; -import {ProgramInfo, ProgramUniform} from '../../types'; -import {createTensorShapeVariables, inputVariable, outputVariable, ShaderHelper, tensorTypeToWsglStorageType} from '../common'; +import {ProgramInfo, ProgramInputTensorInfoDependency, ProgramUniform} from '../../types'; +import {createTensorShapeVariables, inputVariable, outputVariable, ShaderHelper, tensorTypeToWsglStorageType, UniformsArrayType} from '../common'; import {ConvAttributes} from '../conv'; -import {getActivationSnippet} from '../fuse-utils'; +import {appendActivationUniforms, appendActivationUniformsData, getActivationSnippet} from '../fuse-utils'; import {biasSnippet, typeSnippet} from './activation_util'; import {utilFunctions} from './conv_util'; @@ -88,10 +89,10 @@ const conv2dCommonSnippet = let outRow = ${row} / outWidth; let outCol = ${row} % outWidth; - let WRow = ${col} / (filterDims[1] * inChannels); - let WCol = ${col} / inChannels % filterDims[1]; - let xRow = outRow * stride[0] + dilation[0] * WRow - pad[0]; - let xCol = outCol * stride[1] + dilation[1] * WCol - pad[1]; + let WRow = ${col} / (i32(uniforms.w_shape[1]) * inChannels); + let WCol = ${col} / inChannels % i32(uniforms.w_shape[1]); + let xRow = outRow * uniforms.stride[0] + uniforms.dilation[0] * WRow - uniforms.pad[0]; + let xCol = outCol * uniforms.stride[1] + uniforms.dilation[1] * WCol - uniforms.pad[1]; let xCh = ${col} % inChannels; var resData = ${typeSnippet(innerElementSizeX, dataType)}(0.0); // The bounds checking is always needed since we use it to pad zero for @@ 
-108,7 +109,7 @@ const conv2dCommonSnippet = ${readXSnippet}` : ` let col = colIn * ${innerElementSizeX}; - if (row < uniforms.dimAOuter && col < uniforms.dimInner) { + if (row < uniforms.dim_a_outer && col < uniforms.dim_inner) { ${readXSnippet} } return ${typeSnippet(innerElementSizeX, dataType)}(0.0);`) : @@ -117,7 +118,7 @@ const conv2dCommonSnippet = ${readXSnippet}` : ` let col = colIn * ${innerElementSizeX}; - if (row < uniforms.dimInner && col < uniforms.dimBOuter) { + if (row < uniforms.dim_inner && col < uniforms.dim_b_outer) { ${readXSnippet} } return ${typeSnippet(innerElementSizeX, dataType)}(0.0);`); @@ -129,9 +130,8 @@ const conv2dCommonSnippet = isChannelsLast ? typeSnippet(innerElementSizeX, dataType) : typeSnippet(innerElementSizeW, dataType); const bType = isChannelsLast ? typeSnippet(innerElementSizeW, dataType) : typeSnippet(innerElementSizeX, dataType); - const {activationFunction, applyActivation} = getActivationSnippet(attributes, resType); + const applyActivation = getActivationSnippet(attributes, resType, dataType); const userCode = ` - ${activationFunction} fn mm_readA(batch: i32, row : i32, colIn : i32) -> ${aType} { ${isChannelsLast ? sampleX : sampleW} } @@ -142,7 +142,7 @@ const conv2dCommonSnippet = fn mm_write(batch: i32, row : i32, colIn : i32, valueIn : ${resType}) { let col = colIn * ${innerElementSize}; - if (row < uniforms.dimAOuter && col < uniforms.dimBOuter) + if (row < uniforms.dim_a_outer && col < uniforms.dim_b_outer) { var value = valueIn; let outWidth = ${isChannelsLast ? 'i32(uniforms.result_shape[2])' : 'i32(uniforms.result_shape[3])'}; @@ -181,31 +181,40 @@ export const createConv2DMatMulProgramInfo = LOG_DEBUG('verbose', () => `[conv2d_mm_webgpu] dispatch = ${dispatch}`); const innerElementSize = isVec4 ? (isChannelsLast && inChannels % 4 !== 0 ? 3 : 4) : 1; - const tileAOuter = workGroupSize[1] * elementsPerThread[1]; const tileBOuter = workGroupSize[0] * elementsPerThread[0]; const tileInner = Math.max(workGroupSize[0] * innerElementSize, workGroupSize[1]); - const fitAOuter = dimAOuter % tileAOuter === 0; const fitBOuter = dimBOuter % tileBOuter === 0; const fitInner = dimInner % tileInner === 0; - const elementsSize = isVec4 ? [innerElementSize, 4, 4] : [1, 1, 1]; - const t = tensorTypeToWsglStorageType(inputs[0].dataType); - // TODO: support component 2, 3. - const components = isVec4 ? 4 : 1; - const programUniforms: ProgramUniform[] = - [{type: 'int32', data: dimAOuter}, {type: 'int32', data: dimBOuter}, {type: 'int32', data: dimInner}]; - const x = - inputVariable('x', inputs[0].dataType, inputs[0].dims.length, innerElementSize === 3 ? 
1 : innerElementSize); - const w = inputVariable('w', inputs[1].dataType, inputs[1].dims.length, components); - const inputVariables = [x, w]; + const programUniforms: ProgramUniform[] = [ + {type: DataType.int32, data: dimAOuter}, {type: DataType.int32, data: dimBOuter}, + {type: DataType.int32, data: dimInner}, {type: DataType.int32, data: [attributes.pads[0], attributes.pads[1]]}, + {type: DataType.int32, data: attributes.strides}, {type: DataType.int32, data: attributes.dilations} + ]; + appendActivationUniformsData(attributes, programUniforms); + programUniforms.push(...createTensorShapeVariables(inputs[0].dims, inputs[1].dims)); + const inputDependencies: ProgramInputTensorInfoDependency[] = ['rank', 'rank']; + if (hasBias) { + programUniforms.push(...createTensorShapeVariables(inputs[2].dims)); + inputDependencies.push('rank'); + } + programUniforms.push(...createTensorShapeVariables(outputShape)); - programUniforms.push(...createTensorShapeVariables(inputs[0].dims)); - programUniforms.push(...createTensorShapeVariables(inputs[1].dims)); + const getShaderSource = (shaderHelper: ShaderHelper) => { + const uniforms: UniformsArrayType = [ + {name: 'dim_a_outer', type: 'i32'}, {name: 'dim_b_outer', type: 'i32'}, {name: 'dim_inner', type: 'i32'}, + {name: 'pad', type: 'i32', length: 2}, {name: 'stride', type: 'i32', length: 2}, + {name: 'dilation', type: 'i32', length: 2} + ]; + appendActivationUniforms(attributes, uniforms); - let declareFunctions = ` + // TODO: support component 2, 3. + const components = isVec4 ? 4 : 1; + const t = tensorTypeToWsglStorageType(inputs[0].dataType); + let declareFunctions = ` fn setOutputAtIndex(flatIndex : i32, value : ${isVec4 ? `vec4<${t}>` : t}) { result[flatIndex] = ${isVec4 ? `vec4<${t}>` : t}(value); } @@ -213,51 +222,50 @@ export const createConv2DMatMulProgramInfo = let flatIndex = getOutputIndexFromCoords(vec4(d0, d1, d2, d3)); setOutputAtIndex(flatIndex ${isVec4 ? '/ 4' : ''}, value); }`; - if (hasBias) { - const bias = inputVariable('bias', inputs[2].dataType, inputs[2].dims.length, components); - inputVariables.push(bias); - - programUniforms.push(...createTensorShapeVariables(inputs[2].dims)); - - declareFunctions += ` + const x = inputVariable( + 'x', inputs[0].dataType, inputs[0].dims.length, innerElementSize === 3 ? 1 : innerElementSize); + const w = inputVariable('w', inputs[1].dataType, inputs[1].dims.length, components); + const inputVariables = [x, w]; + const output = outputVariable('result', inputs[0].dataType, outputShape.length, components); + if (hasBias) { + const bias = inputVariable('bias', inputs[2].dataType, inputs[2].dims.length, components); + inputVariables.push(bias); + declareFunctions += ` fn getBiasByOutputCoords(coords : vec4) -> ${isVec4 ? `vec4<${t}>` : t} { return bias[coords.${isChannelsLast ? 'w' : 'y'}${isVec4 ? 
'/ 4' : ''}]; }`; - } - const output = outputVariable('result', inputs[0].dataType, outputShape.length, components); - programUniforms.push(...createTensorShapeVariables(outputShape)); - return { - name: 'Conv2DMatMul', - shaderCache: {hint: attributes.cacheKey}, - getRunData: () => ({ - outputs: [{dims: outputShape, dataType: inputs[0].dataType}], - dispatchGroup: {x: dispatch[0], y: dispatch[1], z: dispatch[2]}, - programUniforms, - }), - getShaderSource: (shaderHelper: ShaderHelper) => ` + } + + return ` ${utilFunctions('uniforms.result_strides')} //struct Uniforms { xShape : vec4, wShape : vec4, outShape : vec4, // outShapeStrides: vec3, filterDims : vec2, pad : vec2, stride : vec2, // dilation : vec2, dimAOuter : i32, dimBOuter : i32, dimInner : i32 }; - ${ - shaderHelper.registerUniform('dimAOuter', 'i32') - .registerUniform('dimBOuter', 'i32') - .registerUniform('dimInner', 'i32') - .declareVariables(...inputVariables, output)} - const filterDims : vec2 = vec2(${attributes.kernelShape[0]}, ${attributes.kernelShape[1]}); - const pad : vec2 = vec2(${attributes.pads[0]}, ${attributes.pads[1]}); - const stride : vec2 = vec2(${attributes.strides[0]}, ${attributes.strides[1]}); - const dilation : vec2 = vec2(${attributes.dilations[0]}, ${attributes.dilations[1]}); + ${shaderHelper.registerUniforms(uniforms).declareVariables(...inputVariables, output)} ${declareFunctions} ${ conv2dCommonSnippet( isChannelsLast, fitAOuter, fitBOuter, fitInner, hasBias, attributes, elementsSize[0], elementsSize[1], elementsSize[2], t)} - ${ + ${ isVec4 ? makeMatMulPackedVec4Source(elementsPerThread, workGroupSize, t, undefined, !isChannelsLast, tileInner) : makeMatMulPackedSource( elementsPerThread, workGroupSize, t, undefined, !isChannelsLast, tileInner, false, undefined, - sequentialAccessByThreads)}` + sequentialAccessByThreads)}`; + }; + return { + name: 'Conv2DMatMul', + shaderCache: { + hint: `${attributes.cacheKey};${innerElementSize};${isVec4};${fitAOuter};${fitBOuter};${fitInner};${ + tileAOuter};${tileBOuter};${tileInner}`, + inputDependencies + }, + getRunData: () => ({ + outputs: [{dims: outputShape, dataType: inputs[0].dataType}], + dispatchGroup: {x: dispatch[0], y: dispatch[1], z: dispatch[2]}, + programUniforms, + }), + getShaderSource }; }; diff --git a/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/conv_backprop_mm_webgpu.ts b/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/conv_backprop_mm_webgpu.ts index d425155857e1..080b24a2432a 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/conv_backprop_mm_webgpu.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/conv_backprop_mm_webgpu.ts @@ -19,20 +19,21 @@ // // modified to fit the needs of the project +import {DataType} from '../../../../wasm-common'; import {LOG_DEBUG} from '../../../log'; import {TensorView} from '../../../tensor-view'; -import {ProgramInfo, ProgramUniform} from '../../types'; -import {createTensorShapeVariables, inputVariable, outputVariable, ShaderHelper} from '../common'; +import {ProgramInfo, ProgramInputTensorInfoDependency, ProgramUniform} from '../../types'; +import {createTensorShapeVariables, inputVariable, outputVariable, ShaderHelper, tensorTypeToWsglStorageType, UniformsArrayType} from '../common'; import {ConvTransposeAttributes} from '../conv-transpose'; -import {getActivationSnippet} from '../fuse-utils'; +import {appendActivationUniforms, appendActivationUniformsData, getActivationSnippet} from '../fuse-utils'; -import {biasSnippet, typeSnippet} from './activation_util'; +import {biasSnippet} from 
'./activation_util'; import {utilFunctions} from './conv_util'; import {makeMatMulPackedSource, makeMatMulPackedVec4Source} from './matmul_packed_webgpu'; const conv2dTransposeCommonSnippet = - (isChannelsLast: boolean, addBias = false, attributes: ConvTransposeAttributes, innerElementSize = 4): string => { - const type = typeSnippet(innerElementSize, 'f32'); + (isChannelsLast: boolean, addBias = false, attributes: ConvTransposeAttributes, type: string, + innerElementSize = 4): string => { const getWSnippet = (innerElementSize: number) => { switch (innerElementSize) { case 1: @@ -46,7 +47,7 @@ const conv2dTransposeCommonSnippet = let v1 = w[getIndexFromCoords4D(coord1, vec4(uniforms.w_shape))]; let v2 = w[getIndexFromCoords4D(coord2, vec4(uniforms.w_shape))]; let v3 = w[getIndexFromCoords4D(coord3, vec4(uniforms.w_shape))]; - return vec4(v0, v1, v2, v3); + return ${type}(v0, v1, v2, v3); `; default: throw new Error(`innerElementSize ${innerElementSize} is not supported.`); @@ -74,21 +75,21 @@ const conv2dTransposeCommonSnippet = col % outWidth); `; - const xHeight = isChannelsLast ? 'outBackprop[1]' : 'outBackprop[2]'; - const xWidth = isChannelsLast ? 'outBackprop[2]' : 'outBackprop[3]'; + const xHeight = isChannelsLast ? 'i32(uniforms.x_shape[1])' : 'i32(uniforms.x_shape[2])'; + const xWidth = isChannelsLast ? 'i32(uniforms.x_shape[2])' : 'i32(uniforms.x_shape[3])'; const row = isChannelsLast ? 'row' : 'col'; const col = isChannelsLast ? 'col' : 'row'; const readASnippet = ` - let inChannels = ${isChannelsLast ? 'outBackprop[3]' : 'outBackprop[1]'}; + let inChannels = ${isChannelsLast ? 'i32(uniforms.x_shape[3])' : 'i32(uniforms.x_shape[1])'}; let outWidth = ${isChannelsLast ? 'i32(uniforms.result_shape[2])' : 'i32(uniforms.result_shape[3])'}; let outRow = ${row} / outWidth; let outCol = ${row} % outWidth; - let WRow = ${col} / (filterDims[1] * inChannels); - let WCol = ${col} / inChannels % filterDims[1]; - let xR = f32(outRow - pads[0] + dilation[0] * WRow) / f32(strides[0]); - let xC = f32(outCol - pads[1] + dilation[1] * WCol) / f32(strides[1]); + let WRow = ${col} / (uniforms.filter_dims[1] * inChannels); + let WCol = ${col} / inChannels % uniforms.filter_dims[1]; + let xR = f32(outRow - uniforms.pads[0] + uniforms.dilations[0] * WRow) / f32(uniforms.strides[0]); + let xC = f32(outCol - uniforms.pads[1] + uniforms.dilations[1] * WCol) / f32(uniforms.strides[1]); if (xR < 0.0 || xR >= f32(${xHeight}) || fract(xR) > 0.0) { return ${type}(0.0); } @@ -103,25 +104,25 @@ const conv2dTransposeCommonSnippet = const sampleA = isChannelsLast ? ` let col = colIn * ${innerElementSize}; - if (row < uniforms.dimAOuter && col < uniforms.dimInner) { + if (row < uniforms.dim_a_outer && col < uniforms.dim_inner) { ${readASnippet} } return ${type}(0.0);` : ` let col = colIn * ${innerElementSize}; - if (row < uniforms.dimInner && col < uniforms.dimBOuter) { + if (row < uniforms.dim_inner && col < uniforms.dim_b_outer) { ${readASnippet} } return ${type}(0.0);`; const sampleW = ` let col = colIn * ${innerElementSize}; - let inChannels = ${isChannelsLast ? 'outBackprop[3]' : 'outBackprop[1]'}; - let coordX = filterDims.x - 1 - row / (filterDims[1] * inChannels); - let coordY = filterDims.y - 1 - (row / inChannels) % filterDims[1]; + let inChannels = ${isChannelsLast ? 
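// The read snippet above maps each conv-transpose output coordinate back to a (possibly
// fractional) input coordinate; a sample only contributes when the division by the stride is
// exact. A scalar TS sketch of that test (function and argument names are illustrative only):
function sourceRow(outRow: number, pad: number, dilation: number, wRow: number, stride: number): number|undefined {
  const xR = (outRow - pad + dilation * wRow) / stride;
  // Same test as `xR < 0.0 || fract(xR) > 0.0` in the WGSL above (upper bound left to the caller).
  return xR >= 0 && Number.isInteger(xR) ? xR : undefined;
}
console.log(sourceRow(4, 1, 1, 1, 2));  // xR = 2   -> input row 2 contributes
console.log(sourceRow(5, 1, 1, 1, 2));  // xR = 2.5 -> undefined, skipped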
'i32(uniforms.x_shape[3])' : 'i32(uniforms.x_shape[1])'}; + let coordX = uniforms.filter_dims[0] - 1 - row / (uniforms.filter_dims[1] * inChannels); + let coordY = uniforms.filter_dims[1] - 1 - (row / inChannels) % uniforms.filter_dims[1]; if (${ - isChannelsLast ? 'row < uniforms.dimInner && col < uniforms.dimBOuter' : - 'row < uniforms.dimInner && col < uniforms.dimAOuter'} && coordX >= 0 && coordY >= 0) { + isChannelsLast ? 'row < uniforms.dim_inner && col < uniforms.dim_b_outer' : + 'row < uniforms.dim_inner && col < uniforms.dim_a_outer'} && coordX >= 0 && coordY >= 0) { let rowInner = row % inChannels; let coord = vec4(coordX, coordY, col, rowInner); ${getWSnippet(innerElementSize)} @@ -129,9 +130,8 @@ const conv2dTransposeCommonSnippet = return ${type}(0.0); `; - const {activationFunction, applyActivation} = getActivationSnippet(attributes, type); + const applyActivation = getActivationSnippet(attributes, type); const userCode = ` - ${activationFunction} fn mm_readA(batch: i32, row : i32, colIn : i32) -> ${type} { ${isChannelsLast ? sampleA : sampleW} } @@ -142,7 +142,7 @@ const conv2dTransposeCommonSnippet = fn mm_write(batch: i32, row : i32, colIn : i32, valueInput : ${type}) { let col = colIn * ${innerElementSize}; - if (row < uniforms.dimAOuter && col < uniforms.dimBOuter) { + if (row < uniforms.dim_a_outer && col < uniforms.dim_b_outer) { var value = valueInput; let outWidth = ${isChannelsLast ? 'i32(uniforms.result_shape[2])' : 'i32(uniforms.result_shape[3])'}; ${coordResSnippet} @@ -164,17 +164,14 @@ export const createConv2DTransposeMatMulProgramInfo = const outWidth = isChannelsLast ? outputShape[2] : outputShape[3]; const outHeight = isChannelsLast ? outputShape[1] : outputShape[2]; const outChannels = isChannelsLast ? outputShape[3] : outputShape[1]; - const isVec4 = - isChannelsLast ? inChannels % 4 === 0 && outChannels % 4 === 0 : outWidth % 4 === 0 && outChannels % 4 === 0; + // TODO: enable vec4 for NCHW + const isVec4 = isChannelsLast && (inChannels % 4 === 0 && inChannels % 3) && outChannels % 4 === 0; // TODO: fine tune size const dispatchX = isChannelsLast ? outChannels : outWidth * outHeight; const dispatchY = isChannelsLast ? outWidth * outHeight : outChannels; - const workGroupSize: [number, number, number] = isVec4 ? - [8, 8, 1] : - [(dispatchX <= 4 || dispatchY <= 4) ? 4 : 16, dispatchX > 4 && dispatchY <= 4 ? 4 : 16, 1]; - const elementsPerThread = - isVec4 ? [4, 4, 1] : [dispatchX <= 4 ? 1 : 4, dispatchX > 4 && dispatchY <= 4 ? 1 : 4, 1]; + const workGroupSize: [number, number, number] = [8, 8, 1]; + const elementsPerThread = dimAOuter <= 8 ? [4, 1, 1] : [4, 4, 1]; const dispatch = [ Math.ceil(dispatchX / workGroupSize[0] / elementsPerThread[0]), Math.ceil(dispatchY / workGroupSize[1] / elementsPerThread[1]), @@ -186,72 +183,82 @@ export const createConv2DTransposeMatMulProgramInfo = const innerElementSize = isVec4 ? 4 : 1; const tileInner = Math.max(workGroupSize[0] * innerElementSize, workGroupSize[1]); const components = isVec4 ? 
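// How the dispatch computed just above turns problem size into workgroup counts: each thread
// produces `elementsPerThread` outputs per axis and each workgroup holds `workGroupSize`
// threads per axis. With illustrative sizes (the dispatchX/dispatchY values are assumed):
const workGroupSize = [8, 8, 1];
const elementsPerThread = [4, 4, 1];
const dispatchX = 256;  // e.g. outChannels in the channels-last branch
const dispatchY = 900;  // e.g. outWidth * outHeight
const dispatch = [
  Math.ceil(dispatchX / workGroupSize[0] / elementsPerThread[0]),  // 256 / (8 * 4)  = 8
  Math.ceil(dispatchY / workGroupSize[1] / elementsPerThread[1]),  // 900 / (8 * 4) -> 29
  1,
];
console.log(dispatch);  // [8, 29, 1]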
4 : 1; - const programUniforms: ProgramUniform[] = - [{type: 'int32', data: dimAOuter}, {type: 'int32', data: dimBOuter}, {type: 'int32', data: dimInner}]; - const x = inputVariable('x', inputs[0].dataType, inputs[0].dims.length, components); - const w = inputVariable('w', inputs[1].dataType, inputs[1].dims.length, 1); - const output = outputVariable('result', inputs[0].dataType, outputShape.length, components); - const inputVariables = [x, w]; - programUniforms.push(...createTensorShapeVariables(inputs[0].dims)); - programUniforms.push(...createTensorShapeVariables(inputs[1].dims)); + const filterDims = + [attributes.kernelShape[isChannelsLast ? 1 : 2], attributes.kernelShape[isChannelsLast ? 2 : 3]]; + const effectiveFilterDims = [ + filterDims[0] + (attributes.dilations[0] <= 1 ? 0 : (filterDims[0] - 1) * (attributes.dilations[0] - 1)), + filterDims[1] + (attributes.dilations[1] <= 1 ? 0 : (filterDims[1] - 1) * (attributes.dilations[1] - 1)) + ]; + const pads = [ + effectiveFilterDims[0] - 1 - Math.floor((attributes.pads[0] + attributes.pads[2]) / 2), + effectiveFilterDims[1] - 1 - Math.floor((attributes.pads[1] + attributes.pads[3]) / 2) + ]; + + const programUniforms: ProgramUniform[] = [ + {type: DataType.int32, data: dimAOuter}, {type: DataType.int32, data: dimBOuter}, + {type: DataType.int32, data: dimInner}, {type: DataType.int32, data: attributes.strides}, + {type: DataType.int32, data: attributes.dilations}, {type: DataType.int32, data: filterDims}, + {type: DataType.int32, data: pads} + ]; + appendActivationUniformsData(attributes, programUniforms); + programUniforms.push(...createTensorShapeVariables(inputs[0].dims, inputs[1].dims)); - let declareFunctions = ''; + const inputDependencies: ProgramInputTensorInfoDependency[] = ['rank', 'rank']; if (hasBias) { - const bias = inputVariable('bias', inputs[2].dataType, inputs[2].dims.length, components); - inputVariables.push(bias); programUniforms.push(...createTensorShapeVariables(inputs[2].dims)); - - declareFunctions += ` - fn getBiasByOutputCoords(coords : vec4) -> ${isVec4 ? 'vec4' : 'f32'} { - return bias[coords.${isChannelsLast ? 'w' : 'y'}${isVec4 ? '/ 4' : ''}]; - }`; + inputDependencies.push('rank'); } - programUniforms.push(...createTensorShapeVariables(outputShape)); + const getShaderSource = (shaderHelper: ShaderHelper) => { + const x = inputVariable('x', inputs[0].dataType, inputs[0].dims.length, components); + const w = inputVariable('w', inputs[1].dataType, inputs[1].dims.length, 1); + const output = outputVariable('result', inputs[0].dataType, outputShape.length, components); + const inputVariables = [x, w]; + + let declareFunctions = ''; + if (hasBias) { + const bias = inputVariable('bias', inputs[2].dataType, inputs[2].dims.length, components); + inputVariables.push(bias); + declareFunctions += ` + fn getBiasByOutputCoords(coords : vec4) -> ${bias.type.value} { + return bias[coords.${isChannelsLast ? 'w' : 'y'}${isVec4 ? 
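// A worked instance of the host-side geometry computed above, assuming kernelShape(H, W) =
// [3, 3], dilations = [2, 2] and attribute pads [top, left, bottom, right] = [1, 1, 1, 1]:
const filterDims = [3, 3];
const dilations = [2, 2];
const attrPads = [1, 1, 1, 1];
// Dilation inflates the kernel footprint: 3 + (3 - 1) * (2 - 1) = 5 per axis.
const effectiveFilterDims = [
  filterDims[0] + (dilations[0] <= 1 ? 0 : (filterDims[0] - 1) * (dilations[0] - 1)),
  filterDims[1] + (dilations[1] <= 1 ? 0 : (filterDims[1] - 1) * (dilations[1] - 1)),
];
// Conv-transpose reads with flipped padding: 5 - 1 - floor((1 + 1) / 2) = 3 per axis.
const pads = [
  effectiveFilterDims[0] - 1 - Math.floor((attrPads[0] + attrPads[2]) / 2),
  effectiveFilterDims[1] - 1 - Math.floor((attrPads[1] + attrPads[3]) / 2),
];
console.log(effectiveFilterDims, pads);  // [5, 5] [3, 3]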
'/ 4' : ''}]; + }`; + } + + const uniforms: UniformsArrayType = [ + {name: 'dim_a_outer', type: 'i32'}, {name: 'dim_b_outer', type: 'i32'}, {name: 'dim_inner', type: 'i32'}, + {name: 'strides', type: 'i32', length: 2}, {name: 'dilations', type: 'i32', length: 2}, + {name: 'filter_dims', type: 'i32', length: filterDims.length}, + {name: 'pads', type: 'i32', length: pads.length} + ]; + appendActivationUniforms(attributes, uniforms); + const elemType = tensorTypeToWsglStorageType(inputs[0].dataType, 1); + if (elemType !== 'f16' && elemType !== 'f32') { + throw new Error(`elemType ${elemType} is not supported.`); + } + return ` + ${utilFunctions('uniforms.result_strides')} + ${shaderHelper.registerUniforms(uniforms).declareVariables(...inputVariables, output)}; + ${declareFunctions} + ${conv2dTransposeCommonSnippet(isChannelsLast, hasBias, attributes, x.type.value, innerElementSize)} + ${ + isVec4 ? makeMatMulPackedVec4Source( + elementsPerThread, workGroupSize, elemType, undefined, !isChannelsLast, tileInner) : + makeMatMulPackedSource( + elementsPerThread, workGroupSize, elemType, undefined, !isChannelsLast, tileInner, false, + undefined, sequentialAccessByThreads)}`; + }; + return { name: 'Conv2DTransposeMatMul', - shaderCache: {hint: attributes.cacheKey}, + shaderCache: + {hint: `${attributes.cacheKey};${elementsPerThread};${workGroupSize};${isVec4}`, inputDependencies}, getRunData: () => ({ outputs: [{dims: outputShape, dataType: inputs[0].dataType}], dispatchGroup: {x: dispatch[0], y: dispatch[1], z: dispatch[2]}, programUniforms }), - getShaderSource: (shaderHelper: ShaderHelper) => ` - ${utilFunctions('uniforms.result_strides')} - ${ - shaderHelper.registerUniform('dimAOuter', 'i32') - .registerUniform('dimBOuter', 'i32') - .registerUniform('dimInner', 'i32') - .declareVariables(...inputVariables, output)}; - const outBackprop : vec4 = vec4(${inputs[0].dims.join(',')}); - const filterDims : vec2 = vec2(${attributes.kernelShape[isChannelsLast ? 1 : 2]}, ${ - attributes.kernelShape[isChannelsLast ? 2 : 3]}); - const effectiveFilterDims : vec2 = filterDims + vec2( - ${ - attributes.dilations[0] <= 1 ? - 0 : - (attributes.kernelShape[isChannelsLast ? 1 : 2] - 1) * (attributes.dilations[0] - 1)}, - ${ - attributes.dilations[1] <= 1 ? - 0 : - (attributes.kernelShape[isChannelsLast ? 2 : 3] - 1) * (attributes.dilations[1] - 1)}); - const pads : vec2 = vec2(i32(effectiveFilterDims[0]) - 1 - (${ - attributes.pads[0] + attributes.pads[2]})/2, - i32(effectiveFilterDims[1]) - 1 - (${ - attributes.pads[1] + attributes.pads[3]})/2); - const strides : vec2 = vec2(${attributes.strides[0]}, ${attributes.strides[1]}); - const dilation : vec2 = vec2(${attributes.dilations[0]}, ${attributes.dilations[1]}); - const dimAOuter : i32 = ${dimAOuter}; - const dimBOuter : i32 = ${dimBOuter}; - const dimInner : i32 = ${dimInner}; - ${declareFunctions} - ${conv2dTransposeCommonSnippet(isChannelsLast, hasBias, attributes, innerElementSize)} - ${ - isVec4 ? 
makeMatMulPackedVec4Source( - elementsPerThread, workGroupSize, 'f32', undefined, !isChannelsLast, tileInner) : - makeMatMulPackedSource( - elementsPerThread, workGroupSize, 'f32', undefined, !isChannelsLast, tileInner, false, - undefined, sequentialAccessByThreads)}` + getShaderSource }; }; diff --git a/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/conv_backprop_webgpu.ts b/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/conv_backprop_webgpu.ts index 2e6392aada45..45c89406e173 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/conv_backprop_webgpu.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/conv_backprop_webgpu.ts @@ -17,27 +17,22 @@ // sampled from [@tensorflow/tfjs] tfjs-backend-webgpu/src/conv_backprop_webgpu.ts +import {DataType} from '../../../../wasm-common'; import {LOG_DEBUG} from '../../../log'; import {TensorView} from '../../../tensor-view'; import {ShapeUtil} from '../../../util'; -import {ProgramInfo} from '../../types'; -import {inputVariable, outputVariable, ShaderHelper, tensorTypeToWsglStorageType} from '../common'; +import {ProgramInfo, ProgramInputTensorInfoDependency, ProgramUniform} from '../../types'; +import {createTensorShapeVariables, inputVariable, outputVariable, ShaderHelper, tensorTypeToWsglStorageType, UniformsArrayType} from '../common'; import {ConvTransposeAttributes} from '../conv-transpose'; const createConvTranspose2DOpProgramShaderSource = - (shaderHelper: ShaderHelper, inputs: readonly TensorView[], attributes: ConvTransposeAttributes, - outputShape: readonly number[], hasBias: boolean, is1DimensionDispatch: boolean, isVec4 = false, - dataType: string): string => { - const isChannelsLast = attributes.format === 'NHWC'; + (shaderHelper: ShaderHelper, inputs: readonly TensorView[], outputShape: readonly number[], hasBias: boolean, + is1DimensionDispatch: boolean, isVec4 = false, dataType: string, uniforms: UniformsArrayType, + isChannelsLast = false): string => { const rowDim = isChannelsLast ? 1 : 2; const colDim = isChannelsLast ? 2 : 3; const channelDim = isChannelsLast ? 3 : 1; - const outputSize = ShapeUtil.size(outputShape); const workPerThread = isVec4 ? 2 : 1; - const group = attributes.group; - const wShape = inputs[1].dims; - const inputChannelsPerGroup = wShape[0] / group; - const outputChannelsPerGroup = wShape[1]; let declareFunctions = ` fn setOutputAtIndex(flatIndex : u32, value : ${isVec4 ? `vec4<${dataType}>` : dataType}) { @@ -50,20 +45,21 @@ const createConvTranspose2DOpProgramShaderSource = }`; } const components = isVec4 ? 4 : 1; - const w = inputVariable('W', inputs[1].dataType, inputs[1].dims, components); - const dy = inputVariable('Dy', inputs[0].dataType, inputs[0].dims, components); + const w = inputVariable('W', inputs[1].dataType, inputs[1].dims.length, components); + const dy = inputVariable('Dy', inputs[0].dataType, inputs[0].dims.length, components); const inputVariables = [dy, w]; if (hasBias) { - inputVariables.push(inputVariable('bias', inputs[2].dataType, [outputShape[channelDim]], components)); + inputVariables.push(inputVariable('bias', inputs[2].dataType, [outputShape[channelDim]].length, components)); } - const output = outputVariable('result', inputs[0].dataType, outputShape, components); + const output = outputVariable('result', inputs[0].dataType, outputShape.length, components); + const codeSnippet4 = `{ - let batch: u32 = ${is1DimensionDispatch ? 'global_id.z' : 'workgroup_id.z'} / outShape[1]; - let r = ${is1DimensionDispatch ? 
'global_id.z' : 'workgroup_id.z'} % outShape[1]; + let batch: u32 = ${is1DimensionDispatch ? 'global_id.z' : 'workgroup_id.z'} / uniforms.result_shape[1]; + let r = ${is1DimensionDispatch ? 'global_id.z' : 'workgroup_id.z'} % uniforms.result_shape[1]; let c = ${is1DimensionDispatch ? 'global_id.y' : 'workgroup_id.y'} * ${workPerThread}; let d1: u32 = ${is1DimensionDispatch ? 'global_id.x' : 'workgroup_id.x'} * 4; - let dyCorner = vec2(i32(r), i32(c)) - vec2(pads); + let dyCorner = vec2(i32(r), i32(c)) - vec2(uniforms.pads); // Convolve dy(?, ?, d2) with w(:, :, d1, d2) to compute dx(xR, xC, d1). // ? = to be determined. : = across all values in that axis. @@ -71,29 +67,29 @@ const createConvTranspose2DOpProgramShaderSource = for (var i = 0; i < ${workPerThread}; i++) { dotProd[i] = vec4<${dataType}>(0.0); } - for (var wR: u32 = 0; wR < filterDims[0]; wR = wR + 1) { - var dyR = (${dataType}(dyCorner.x) + ${dataType}(wR)) / ${dataType}(strides.x); - let wRPerm = filterDims[0] - 1 - wR; - if (dyR < 0.0 || dyR >= ${dataType}(outBackprop[1]) || + for (var wR: u32 = 0; wR < uniforms.filter_dims[0]; wR = wR + 1) { + var dyR = (${dataType}(dyCorner.x) + ${dataType}(wR)) / ${dataType}(uniforms.strides.x); + let wRPerm = uniforms.filter_dims[0] - 1 - wR; + if (dyR < 0.0 || dyR >= ${dataType}(uniforms.Dy_shape[1]) || fract(dyR) > 0.0 || wRPerm < 0) { continue; } let idyR: u32 = u32(dyR); - for (var wC: u32 = 0; wC < filterDims[1]; wC = wC + 1) { - let dyC = (${dataType}(dyCorner.y) + ${dataType}(wC)) / ${dataType}(strides.y); - let dyC2 = (${dataType}(dyCorner.y) + 1.0 + ${dataType}(wC)) / ${dataType}(strides.y); - let wCPerm = filterDims[1] - 1 - wC; + for (var wC: u32 = 0; wC < uniforms.filter_dims[1]; wC = wC + 1) { + let dyC = (${dataType}(dyCorner.y) + ${dataType}(wC)) / ${dataType}(uniforms.strides.y); + let dyC2 = (${dataType}(dyCorner.y) + 1.0 + ${dataType}(wC)) / ${dataType}(uniforms.strides.y); + let wCPerm = uniforms.filter_dims[1] - 1 - wC; if (wCPerm < 0) { continue; } var bDyCVal = true; var bDyCVal2 = true; - if (dyC < 0.0 || dyC >= ${dataType}(outBackprop[2]) || + if (dyC < 0.0 || dyC >= ${dataType}(uniforms.Dy_shape[2]) || fract(dyC) > 0.0) { bDyCVal = false; } - if (dyC2 < 0.0 || dyC2 >= ${dataType}(outBackprop[2]) || + if (dyC2 < 0.0 || dyC2 >= ${dataType}(uniforms.Dy_shape[2]) || fract(dyC2) > 0.0) { bDyCVal2 = false; } @@ -101,7 +97,7 @@ const createConvTranspose2DOpProgramShaderSource = let idyC: u32 = u32(dyC); let idyC2: u32 = u32(dyC2); if (bDyCVal && bDyCVal2) { - let d2Length = outBackprop[3]; + let d2Length = uniforms.Dy_shape[3]; for (var d2 :u32 = 0; d2 < d2Length; d2 = d2 + 4) { let wValue0 = ${w.get('u32(wRPerm)', 'u32(wCPerm)', 'd1', 'd2')}; let wValue1 = ${w.get('u32(wRPerm)', 'u32(wCPerm)', 'd1 + 1', 'd2')}; @@ -123,7 +119,7 @@ const createConvTranspose2DOpProgramShaderSource = dot(xValue, wValue3)); } } else if (bDyCVal) { - let d2Length = outBackprop[${channelDim}]; + let d2Length = uniforms.Dy_shape[${channelDim}]; for (var d2: u32 = 0; d2 < d2Length; d2 = d2 + 4) { let wValue0 = ${w.get('u32(wRPerm)', 'u32(wCPerm)', 'd1', 'd2')}; let wValue1 = ${w.get('u32(wRPerm)', 'u32(wCPerm)', 'd1 + 1', 'd2')}; @@ -138,7 +134,7 @@ const createConvTranspose2DOpProgramShaderSource = dotProd[0] = dotProd[0] + tmpval; } } else if (bDyCVal2) { - let d2Length = outBackprop[3]; + let d2Length = uniforms.Dy_shape[3]; for (var d2: u32 = 0; d2 < d2Length; d2 = d2 + 4) { let wValue0 = ${w.get('u32(wRPerm)', 'u32(wCPerm)', 'd1', 'd2')}; let wValue1 = ${w.get('u32(wRPerm)', 
'u32(wCPerm)', 'd1 + 1', 'd2')}; @@ -157,7 +153,7 @@ const createConvTranspose2DOpProgramShaderSource = } for (var i: u32 = 0; i < ${workPerThread}; i = i + 1) { - let value = dotProd[i] + ${hasBias ? 'bias[c+i]' : '0.0'}; + let value = dotProd[i] + ${hasBias ? 'bias[c+i]' : `vec4<${dataType}>(0.0)`}; ${output.set('batch', 'r', 'c + i', 'd1', 'value')}; } }`; @@ -167,39 +163,39 @@ const createConvTranspose2DOpProgramShaderSource = let d1 = ${output.indicesGet('outputIndices', channelDim)}; let r = ${output.indicesGet('outputIndices', rowDim)}; let c = ${output.indicesGet('outputIndices', colDim)}; - let dyCorner = vec2(i32(r), i32(c)) - pads; + let dyCorner = vec2(i32(r), i32(c)) - uniforms.pads; let dyRCorner = dyCorner.x; let dyCCorner = dyCorner.y; - let groupId = d1 / ${outputChannelsPerGroup}; - let wOutChannel = d1 - groupId * ${outputChannelsPerGroup}; + let groupId = d1 / uniforms.output_channels_per_group; + let wOutChannel = d1 - groupId * uniforms.output_channels_per_group; // Convolve dy(?, ?, d2) with w(:, :, d1, d2) to compute dx(xR, xC, d1). // ? = to be determined. : = across all values in that axis. - var dotProd = 0.0; - for (var wR: u32 = 0; wR < effectiveFilterDims.x; wR = wR + 1) { - if (wR % dilations.x != 0) { + var dotProd = ${dataType}(0.0); + for (var wR: u32 = 0; wR < uniforms.effective_filter_dims.x; wR = wR + 1) { + if (wR % uniforms.dilations.x != 0) { continue; } - let dyR = (${dataType}(dyRCorner) + ${dataType}(wR)) / ${dataType}(strides[0]); - let wRPerm = filterDims.x - 1 - wR / dilations.x; - if (dyR < 0.0 || dyR >= ${dataType}(outBackprop[${rowDim}]) || fract(dyR) > 0.0 || + let dyR = (${dataType}(dyRCorner) + ${dataType}(wR)) / ${dataType}(uniforms.strides[0]); + let wRPerm = uniforms.filter_dims.x - 1 - wR / uniforms.dilations.x; + if (dyR < 0.0 || dyR >= ${dataType}(uniforms.Dy_shape[${rowDim}]) || fract(dyR) > 0.0 || wRPerm < 0) { continue; } let idyR: u32 = u32(dyR); - for (var wC: u32 = 0; wC < effectiveFilterDims.y; wC = wC + 1) { - if (wC % dilations.y != 0) { + for (var wC: u32 = 0; wC < uniforms.effective_filter_dims.y; wC = wC + 1) { + if (wC % uniforms.dilations.y != 0) { continue; } - let dyC = (${dataType}(dyCCorner) + ${dataType}(wC)) / ${dataType}(strides.y); - let wCPerm = filterDims.y - 1 - wC / dilations.y; - if (dyC < 0.0 || dyC >= ${dataType}(outBackprop[${colDim}]) || + let dyC = (${dataType}(dyCCorner) + ${dataType}(wC)) / ${dataType}(uniforms.strides.y); + let wCPerm = uniforms.filter_dims.y - 1 - wC / uniforms.dilations.y; + if (dyC < 0.0 || dyC >= ${dataType}(uniforms.Dy_shape[${colDim}]) || fract(dyC) > 0.0 || wCPerm < 0) { continue; } let idyC: u32 = u32(dyC); - var inputChannel = groupId * ${inputChannelsPerGroup}; - for (var d2: u32 = 0; d2 < ${inputChannelsPerGroup}; d2 = d2 + 1) { + var inputChannel = groupId * uniforms.input_channels_per_group; + for (var d2: u32 = 0; d2 < uniforms.input_channels_per_group; d2 = d2 + 1) { let xValue = ${ isChannelsLast ? dy.get('batch', 'idyR', 'idyC', 'inputChannel') : dy.get('batch', 'inputChannel', 'idyR', 'idyC')}; @@ -209,32 +205,16 @@ const createConvTranspose2DOpProgramShaderSource = } } } - let value = dotProd + ${hasBias ? 'bias[d1]' : '0.0'}; + let value = dotProd + ${hasBias ? 
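// The group bookkeeping used in the scalar path above, in TS form. Per the ONNX spec the
// ConvTranspose weight is laid out [inChannels, outChannelsPerGroup, kH, kW]; the concrete
// numbers below are assumed for the example:
const group = 2;
const wShape = [8, 3, 3, 3];
const inputChannelsPerGroup = wShape[0] / group;  // 4
const outputChannelsPerGroup = wShape[1];         // 3
const d1 = 4;  // an output-channel index
const groupId = Math.floor(d1 / outputChannelsPerGroup);    // 1
const wOutChannel = d1 - groupId * outputChannelsPerGroup;  // 1
// Input channels feeding d1 start at groupId * inputChannelsPerGroup and span inputChannelsPerGroup.
console.log(groupId, wOutChannel, groupId * inputChannelsPerGroup);  // 1 1 4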
'bias[d1]' : `${dataType}(0.0)`}; ${output.setByOffset('global_idx', 'value')}; `; return ` - ${shaderHelper.declareVariables(...inputVariables, output)} + ${shaderHelper.registerUniforms(uniforms).declareVariables(...inputVariables, output)} ${declareFunctions} - const outShape : vec4 = vec4(${outputShape.join(',')}); - const outBackprop : vec4 = vec4(${inputs[0].dims.join(',')}); - const strides : vec2 = vec2(${attributes.strides[0]}, ${attributes.strides[1]}); - const filterDims : vec2 = vec2(${attributes.kernelShape[isChannelsLast ? 1 : 2]}, ${ - attributes.kernelShape[isChannelsLast ? 2 : 3]}); - const dilations : vec2 = vec2(${attributes.dilations[0]}, ${attributes.dilations[1]}); - const effectiveFilterDims : vec2 = filterDims + vec2( - ${ - attributes.dilations[0] <= 1 ? - 0 : - (attributes.kernelShape[isChannelsLast ? 1 : 2] - 1) * (attributes.dilations[0] - 1)}, - ${ - attributes.dilations[1] <= 1 ? - 0 : - (attributes.kernelShape[isChannelsLast ? 2 : 3] - 1) * (attributes.dilations[1] - 1)}); - const pads : vec2 = vec2(i32(effectiveFilterDims[0]) - 1 - (${attributes.pads[0] + attributes.pads[2]})/2, - i32(effectiveFilterDims[1]) - 1 - (${attributes.pads[1] + attributes.pads[3]})/2); + ${shaderHelper.mainStart()} - ${shaderHelper.guardAgainstOutOfBoundsWorkgroupSizes(outputSize)}; + ${shaderHelper.guardAgainstOutOfBoundsWorkgroupSizes('uniforms.output_size')}; ${isVec4 ? codeSnippet4 : codeSnippet}}`; }; @@ -257,19 +237,73 @@ export const createConvTranspose2DProgramInfo = ]; LOG_DEBUG('verbose', () => `[conv2d_backprop_webgpu] dispatch = ${dispatch}`); - const dataType = tensorTypeToWsglStorageType(inputs[0].dataType); + const isChannelsLast = attributes.format === 'NHWC'; + const inputDependencies: ProgramInputTensorInfoDependency[] = ['rank', 'rank']; + const strides = [attributes.strides[0], attributes.strides[1]]; + const filterDims = + [attributes.kernelShape[isChannelsLast ? 1 : 2], attributes.kernelShape[isChannelsLast ? 2 : 3]]; + const dilations = [attributes.dilations[0], attributes.dilations[1]]; + const effectiveFilterDims = [ + filterDims[0] + + (attributes.dilations[0] <= 1 ? + 0 : + (attributes.kernelShape[isChannelsLast ? 1 : 2] - 1) * (attributes.dilations[0] - 1)), + filterDims[1] + + (attributes.dilations[1] <= 1 ? + 0 : + (attributes.kernelShape[isChannelsLast ? 
2 : 3] - 1) * (attributes.dilations[1] - 1)) + ]; + const pads = [ + effectiveFilterDims[0] - 1 - Math.floor((attributes.pads[0] + attributes.pads[2]) / 2), + effectiveFilterDims[1] - 1 - Math.floor((attributes.pads[1] + attributes.pads[3]) / 2) + ]; + + const isVec4 = false; + const group = attributes.group; + const wShape = inputs[1].dims; + const inputChannelsPerGroup = wShape[0] / group; + const outputChannelsPerGroup = wShape[1]; + + const programUniforms: ProgramUniform[] = [ + {type: DataType.uint32, data: outputSize}, {type: DataType.uint32, data: strides}, + {type: DataType.uint32, data: filterDims}, {type: DataType.uint32, data: dilations}, + {type: DataType.uint32, data: effectiveFilterDims}, {type: DataType.int32, data: pads}, + {type: DataType.uint32, data: inputChannelsPerGroup}, {type: DataType.uint32, data: outputChannelsPerGroup}, + ...createTensorShapeVariables(inputs[0].dims, inputs[1].dims) + ]; + if (hasBias) { + programUniforms.push(...createTensorShapeVariables(inputs[2].dims)); + inputDependencies.push('rank'); + } + programUniforms.push(...createTensorShapeVariables(outputShape)); + + const is1DimensionDispatch = dispatch[1] === 1 && dispatch[2] === 1; + const getShaderSource = (shaderHelper: ShaderHelper) => { + const uniforms: UniformsArrayType = [ + {name: 'output_size', type: 'u32'}, {name: 'strides', type: 'u32', length: strides.length}, + {name: 'filter_dims', type: 'u32', length: filterDims.length}, + {name: 'dilations', type: 'u32', length: filterDims.length}, + {name: 'effective_filter_dims', type: 'u32', length: effectiveFilterDims.length}, + {name: 'pads', type: 'i32', length: pads.length}, {name: 'input_channels_per_group', type: 'u32'}, + {name: 'output_channels_per_group', type: 'u32'} + ]; + const dataType = tensorTypeToWsglStorageType(inputs[0].dataType); + return `${ + createConvTranspose2DOpProgramShaderSource( + shaderHelper, inputs, outputShape, hasBias, is1DimensionDispatch, isVec4, dataType, uniforms, + isChannelsLast)}`; + }; return { name: 'ConvTranspose2D', - shaderCache: {hint: attributes.cacheKey}, + shaderCache: {hint: `${attributes.cacheKey};`, inputDependencies}, getRunData: () => ({ dispatchGroup: {x: dispatch[0], y: dispatch[1], z: dispatch[2]}, outputs: [{ dims: squeezeOutputShapeFunction ?
squeezeOutputShapeFunction(outputShape) : outputShape, dataType: inputs[0].dataType - }] + }], + programUniforms }), - getShaderSource: (shaderHelper: ShaderHelper) => createConvTranspose2DOpProgramShaderSource( - shaderHelper, inputs, attributes, outputShape, hasBias, dispatch[1] === 1 && dispatch[2] === 1, false, - dataType), + getShaderSource }; }; diff --git a/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/matmul_packed_webgpu.ts b/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/matmul_packed_webgpu.ts index 47ec16a29671..29c7941e6bd3 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/matmul_packed_webgpu.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/matmul_packed_webgpu.ts @@ -19,11 +19,12 @@ // // modified to fit the needs of the project +import {DataType} from '../../../../wasm-common'; import {TensorView} from '../../../tensor-view'; import {ShapeUtil} from '../../../util'; import {ProgramInfo, ProgramInputTensorInfoDependency, ProgramUniform} from '../../types'; -import {createTensorShapeVariables, enableShapesUniforms, getBroadcastDims, IndicesHelper, inputVariable, internalVariable, outputVariable, ShaderHelper, tensorTypeToWsglStorageType} from '../common'; -import {getActivationSnippet, InternalActivationAttributes} from '../fuse-utils'; +import {createTensorShapeVariables, getBroadcastDims, IndicesHelper, inputVariable, internalVariable, outputVariable, ShaderHelper, tensorTypeToWsglStorageType, UniformsArrayType} from '../common'; +import {appendActivationUniforms, appendActivationUniformsData, getActivationSnippet, InternalActivationAttributes} from '../fuse-utils'; import {typeSnippet} from './activation_util'; @@ -112,14 +113,14 @@ fn main(@builtin(local_invocation_id) localId : vec3, ${batchDims ? `let batchIndices = ${batchDims.offsetToIndices('u32(batch)')};` : ''} let globalRowStart = i32(workgroupId.y) * ${tileAOuter}; - let numTiles = ${splitK ? `${Math.ceil(splitedDimInner / tileInner)}` : '(uniforms.dimInner - 1) / tileInner + 1'}; + let num_tiles = ${splitK ? `${Math.ceil(splitedDimInner / tileInner)}` : '(uniforms.dim_inner - 1) / tileInner + 1'}; var kStart = ${splitK ? `i32(globalId.z) * ${splitedDimInner}` : '0'}; var acc: array, rowPerThread>; // Loop over shared dimension. let tileRowB = localRow * ${rowPerThreadB}; - for (var t = 0; t < numTiles; t = t + 1) { + for (var t = 0; t < num_tiles; t = t + 1) { // Load one tile of A into local memory. for (var innerRow = 0; innerRow < rowPerThread; innerRow = innerRow + 1) { let inputRow = tileRow + innerRow; @@ -204,7 +205,7 @@ export const makeMatMulPackedSource = let globalColStart = i32(workgroupId.x) * ${tileBOuter}; // Loop over shared dimension. - for (var t = 0; t < numTiles; t = t + 1) { + for (var t = 0; t < num_tiles; t = t + 1) { // Load one tile of A into local memory. for (var inputRow = localRow; inputRow < ${tileAHight}; inputRow = inputRow + ${workgroupSize[1]}) { for (var inputCol = localCol; inputCol < ${tileAWidth}; inputCol = inputCol + ${workgroupSize[0]}) { @@ -260,7 +261,7 @@ let tileRowA = i32(localId.y) * ${rowPerThreadA}; let tileColA = i32(localId.x) * ${colPerThreadA}; let tileRowB = i32(localId.y) * ${rowPerThreadB}; // Loop over shared dimension. -for (var t = 0; t < numTiles; t = t + 1) { +for (var t = 0; t < num_tiles; t = t + 1) { // Load one tile of A into local memory. 
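// `num_tiles` above is an integer ceiling division written without a ceil() call, matching the
// i32 arithmetic available in WGSL. The same identity in TS, with assumed operand sizes:
const dimInner = 100;
const tileInner = 32;
const numTiles = Math.floor((dimInner - 1) / tileInner) + 1;  // floor((100 - 1) / 32) + 1 = 4
console.log(numTiles === Math.ceil(dimInner / tileInner));    // true for any positive sizes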
for (var innerRow = 0; innerRow < ${rowPerThreadA}; innerRow = innerRow + 1) { for (var innerCol = 0; innerCol < ${colPerThreadA}; innerCol = innerCol + 1) { @@ -322,7 +323,8 @@ fn main(@builtin(local_invocation_id) localId : vec3, @builtin(workgroup_id) workgroupId : vec3) { let batch = ${splitK ? '0' : 'i32(globalId.z)'}; ${batchDims ? `let batchIndices = ${batchDims.offsetToIndices('u32(batch)')};` : ''} - let numTiles = ${splitK ? `${Math.ceil(splitedDimInner / tileInner)}` : '(uniforms.dimInner - 1) / tileInner + 1'}; + let num_tiles = ${ + splitK ? `${Math.ceil(splitedDimInner / tileInner)}` : '(uniforms.dim_inner - 1) / tileInner + 1'}; var kStart = ${splitK ? `i32(globalId.z) * ${splitedDimInner}` : '0'}; var acc : array, rowPerThread>; @@ -379,7 +381,7 @@ const matMulReadWriteFnSource = typeSnippet(component, dataType)} { var value = ${typeSnippet(component, dataType)}(0.0); let col = colIn * ${component}; - if(row < uniforms.dimAOuter && col < uniforms.dimInner) + if(row < uniforms.dim_a_outer && col < uniforms.dim_inner) { ${getAIndices()} value = ${aVariable.getByIndices('aIndices')}; @@ -391,7 +393,7 @@ const matMulReadWriteFnSource = typeSnippet(component, dataType)} { var value = ${typeSnippet(component, dataType)}(0.0); let col = colIn * ${component}; - if(row < uniforms.dimInner && col < uniforms.dimBOuter) + if(row < uniforms.dim_inner && col < uniforms.dim_b_outer) { ${getBIndices()} value = ${bVariable.getByIndices('bIndices')}; @@ -401,7 +403,7 @@ const matMulReadWriteFnSource = fn mm_write(batch: i32, row: i32, colIn: i32, valueIn: ${typeSnippet(component, dataType)}) { let col = colIn * ${component}; - if (row < uniforms.dimAOuter && col < uniforms.dimBOuter) { + if (row < uniforms.dim_a_outer && col < uniforms.dim_b_outer) { var value = valueIn; let coords = vec3(batch, row, colIn); ${ @@ -422,16 +424,10 @@ export const createMatmulProgramInfo = isChannelsLast = false /* only used for conv2dByMatMul*/): ProgramInfo => { const aShape = inputs[0].dims; const bShape = inputs[1].dims; - const outerDimsA = aShape.slice(0, -2); const outerDimsB = bShape.slice(0, -2); - const outerDims = reshapedOutputShape ? reshapedOutputShape.slice(0, -2) : outputShape.slice(0, -2); - const enableBatchUniforms = enableShapesUniforms(outerDims.length); - const batchShapeOrRank = enableBatchUniforms ? outerDims.length : outerDims; - const batchDims = internalVariable('batchDims', inputs[0].dataType, batchShapeOrRank, 1); const batchSize = ShapeUtil.size(outerDims); - const dimAOuter = aShape[aShape.length - 2]; const dimInner = aShape[aShape.length - 1]; const dimBOuter = bShape[bShape.length - 1]; @@ -446,72 +442,62 @@ export const createMatmulProgramInfo = Math.ceil(batchSize / workgroupSize[2] / elementsPerThread[2]) ]; - const dataType = tensorTypeToWsglStorageType(inputs[0].dataType); const components = isVec4 ? 4 : 1; - const aShapeTemp = [...outerDimsA, dimAOuter, dimInner / components]; - const enableAShapesUniforms = enableShapesUniforms(aShapeTemp.length); - const aShapeOrRank = enableAShapesUniforms ? aShapeTemp.length : aShapeTemp; - + const aRank = aShapeTemp.length; const bShapeTemp = [...outerDimsB, dimInner, dimBOuter / components]; - const enableBShapesUniforms = enableShapesUniforms(bShapeTemp.length); - const bShapeOrRank = enableBShapesUniforms ? 
bShapeTemp.length : bShapeTemp; - + const bRank = bShapeTemp.length; const outputShapeTemp = [batchSize, dimAOuter, dimBOuter / components]; - - const A = inputVariable('a', inputs[0].dataType, aShapeOrRank, components); - const B = inputVariable('b', inputs[1].dataType, bShapeOrRank, components); - const output = outputVariable('result', inputs[0].dataType, outputShapeTemp.length, components); - const inputVariables = [A, B]; - const programUniforms: ProgramUniform[] = - [{type: 'int32', data: dimAOuter}, {type: 'int32', data: dimBOuter}, {type: 'int32', data: dimInner}]; - if (enableBatchUniforms) { - programUniforms.push(...createTensorShapeVariables(outerDims)); - } - if (enableAShapesUniforms) { - programUniforms.push(...createTensorShapeVariables(aShapeTemp)); - } - if (enableBShapesUniforms) { - programUniforms.push(...createTensorShapeVariables(bShapeTemp)); - } - const inputDependencies: ProgramInputTensorInfoDependency[] = []; - inputDependencies.push(enableAShapesUniforms ? 'rank' : 'dims'); - inputDependencies.push(enableBShapesUniforms ? 'rank' : 'dims'); + const programUniforms: ProgramUniform[] = [ + {type: DataType.int32, data: dimAOuter}, {type: DataType.int32, data: dimBOuter}, + {type: DataType.int32, data: dimInner} + ]; + appendActivationUniformsData(activationAttributes, programUniforms); + programUniforms.push(...createTensorShapeVariables(outerDims, aShapeTemp, bShapeTemp)); + const inputDependencies: ProgramInputTensorInfoDependency[] = ['rank', 'rank']; const hasBias = inputs.length > 2; - const {activationFunction, applyActivation} = getActivationSnippet(activationAttributes, output.type.value); - const declareFunctions = matMulReadWriteFnSource( - components, hasBias, applyActivation, [batchDims, A, B, output], [outerDimsA, outerDimsB, outerDims], - isChannelsLast); if (hasBias) { - const biasComponents = isChannelsLast ? components : 1; - inputVariables.push(inputVariable('bias', inputs[2].dataType, inputs[2].dims.length, biasComponents)); programUniforms.push(...createTensorShapeVariables(inputs[2].dims)); - inputDependencies.push('rank'); } programUniforms.push(...createTensorShapeVariables(outputShapeTemp)); - const getShaderSource = (shaderHelper: ShaderHelper) => ` + const getShaderSource = (shaderHelper: ShaderHelper) => { + const batchRank = outerDims.length; + const batchDims = internalVariable('batchDims', inputs[0].dataType, batchRank, 1); + const dataType = tensorTypeToWsglStorageType(inputs[0].dataType); + + const A = inputVariable('a', inputs[0].dataType, aRank, components); + const B = inputVariable('b', inputs[1].dataType, bRank, components); + const output = outputVariable('result', inputs[0].dataType, outputShapeTemp.length, components); + const inputVariables = [A, B]; + if (hasBias) { + const biasComponents = isChannelsLast ? 
components : 1; + inputVariables.push(inputVariable('bias', inputs[2].dataType, inputs[2].dims.length, biasComponents)); + } + const uniforms: UniformsArrayType = + [{name: 'dim_a_outer', type: 'i32'}, {name: 'dim_b_outer', type: 'i32'}, {name: 'dim_inner', type: 'i32'}]; + appendActivationUniforms(activationAttributes, uniforms); + const baseType = tensorTypeToWsglStorageType(output.type.tensor); + const applyActivation = getActivationSnippet(activationAttributes, output.type.value, baseType); + const declareFunctions = matMulReadWriteFnSource( + components, hasBias, applyActivation, [batchDims, A, B, output], [outerDimsA, outerDimsB, outerDims], + isChannelsLast); + return ` ${ - shaderHelper.registerUniform('dimAOuter', 'i32') - .registerUniform('dimBOuter', 'i32') - .registerUniform('dimInner', 'i32') - .registerInternalVariables(batchDims) - .declareVariables(...inputVariables, output)} - ${activationFunction} + shaderHelper.registerUniforms(uniforms).registerInternalVariables(batchDims).declareVariables( + ...inputVariables, output)} ${declareFunctions} ${ - isVec4 ? makeMatMulPackedVec4Source(elementsPerThread, workgroupSize, dataType, batchDims) : - makeMatMulPackedSource(elementsPerThread, workgroupSize, dataType, batchDims)} + isVec4 ? makeMatMulPackedVec4Source(elementsPerThread, workgroupSize, dataType, batchDims) : + makeMatMulPackedSource(elementsPerThread, workgroupSize, dataType, batchDims)} `; - // TODO: turn clipMax and clipMin to uniforms. + }; return { name: 'MatMul', shaderCache: { - hint: activationAttributes.activationCacheKey + `${elementsPerThread}` + - `${isVec4}` + - `${isChannelsLast}`, + hint: `${elementsPerThread};${activationAttributes.activation};${isVec4};${isChannelsLast}`, inputDependencies }, getRunData: () => ({ diff --git a/js/web/lib/wasm/jsep/webgpu/ops/attention.ts b/js/web/lib/wasm/jsep/webgpu/ops/attention.ts index e1f2a47301bf..37606232a726 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/attention.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/attention.ts @@ -1,11 +1,11 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. 
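// With shapes supplied through uniforms, the shader-cache hints in these hunks only need the
// values that still change the generated WGSL text; tensor shapes no longer belong in the key.
// A hypothetical illustration of the payoff - two dispatches that differ only in shape now map
// to one compiled pipeline:
const makeHint = (elementsPerThread: number[], activation: string, isVec4: boolean, isChannelsLast: boolean) =>
    `${elementsPerThread};${activation};${isVec4};${isChannelsLast}`;
const hintA = makeHint([4, 4, 1], '', true, false);  // e.g. a 128x64 matmul
const hintB = makeHint([4, 4, 1], '', true, false);  // e.g. a 256x96 matmul, same codegen inputs
console.log(hintA === hintB);  // true -> shader source is reused, only uniform values differ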
+import {DataType} from '../../../wasm-common'; import {TensorView} from '../../tensor-view'; -import {createAttributeWithCacheKey} from '../attribute-with-cache-key'; -import {ComputeContext, GpuDataType} from '../types'; +import {ComputeContext, GpuDataType, ProgramUniform} from '../types'; -import {castToF32, fillVector, getMaxComponents, inputVariable, outputVariable, ShaderHelper, sumVector, tensorTypeToWsglStorageType} from './common'; +import {castToF32, fillVector, getMaxComponents, inputVariable, outputVariable, ShaderHelper, sumVector, tensorTypeToWsglStorageType, tensorTypeToWsglValueType, UniformDataElementType, UniformsArrayType} from './common'; export const enum AttentionQkvFormat { unknown, // enum value not set, or depends on qkv projection implementation details @@ -231,20 +231,8 @@ const validateAttentionInputs = (inputs: readonly TensorView[], attributes: Atte }; }; -export const parseAttentionAttributes = (attributes: AttentionAttrs): AttentionAttrs => - createAttributeWithCacheKey({...attributes}); - export const computeInPlaceSoftmax = (context: ComputeContext, input: TensorView, n: number, d: number) => { const components = getMaxComponents(d); - const inputHelper = outputVariable('x', input.dataType, input.dims, components); - - let threadMaxValue = 'threadMaxVector'; - if (components === 2) { - threadMaxValue = 'max(threadMaxVector.x, threadMaxVector.y)'; - } else if (components === 4) { - threadMaxValue = 'max(max(threadMaxVector.x, threadMaxVector.y), max(threadMaxVector.z, threadMaxVector.w))'; - } - const dataType = tensorTypeToWsglStorageType(input.dataType); let WG = 64; const dComp = d / components; if (dComp < WG) { @@ -253,25 +241,42 @@ export const computeInPlaceSoftmax = (context: ComputeContext, input: TensorView WG = Math.ceil(dComp / 8); } const elementsPerWG = Math.ceil(d / components / WG); + const programUniforms: ProgramUniform[] = [ + {type: input.dataType, data: 1 / d}, {type: DataType.uint32, data: dComp}, + {type: DataType.uint32, data: elementsPerWG} + ]; + const dataType = tensorTypeToWsglStorageType(input.dataType, components); + + const getShaderSource = (shaderHelper: ShaderHelper) => { + const inputHelper = outputVariable('x', input.dataType, input.dims, components); + let threadMaxValue = 'thread_max_vector'; + if (components === 2) { + threadMaxValue = 'max(thread_max_vector.x, thread_max_vector.y)'; + } else if (components === 4) { + threadMaxValue = + 'max(max(thread_max_vector.x, thread_max_vector.y), max(thread_max_vector.z, thread_max_vector.w))'; + } + const elemValueType = tensorTypeToWsglValueType(input.dataType); + const uniforms: UniformsArrayType = [ + {name: 'd_inv', type: elemValueType as UniformDataElementType}, {name: 'd_comp', type: 'u32'}, + {name: 'elements_per_wg', type: 'u32'} + ]; - const getShaderSource = (shaderHelper: ShaderHelper) => ` - const dInv: ${dataType} = 1 / ${d}; - const dComp = ${d / components}; + return ` var wgMax: array; var wgSum: array; - - ${shaderHelper.declareVariables(inputHelper)} - @compute @workgroup_size(${WG}, 1, 1) - fn main(@builtin(workgroup_id) workgroup_id : vec3, - @builtin(local_invocation_index) local_index : u32) { - let localOffset = local_index * ${elementsPerWG}; - let offset: u32 = workgroup_id.x * dComp + localOffset; - - var threadMaxVector = ${fillVector('f32', components, '-3.402823e+38f')}; - for (var i: u32 = 0; i < ${elementsPerWG} && i + localOffset < dComp; i++) { - threadMaxVector = max(${castToF32(dataType, components, 'x[offset + i]')}, threadMaxVector); + 
${shaderHelper.registerUniforms(uniforms).declareVariables(inputHelper)} + ${shaderHelper.mainStart([ + WG, 1, 1 + ])} + let localOffset = local_idx * uniforms.elements_per_wg; + let offset: u32 = workgroup_id.x * uniforms.d_comp + localOffset; + + var thread_max_vector = ${fillVector('f32', components, '-3.402823e+38f')}; + for (var i: u32 = 0; i < uniforms.elements_per_wg && i + localOffset < uniforms.d_comp; i++) { + thread_max_vector = max(${castToF32(elemValueType, components, 'x[offset + i]')}, thread_max_vector); } - wgMax[local_index] = ${threadMaxValue}; + wgMax[local_idx] = ${threadMaxValue}; workgroupBarrier(); var maxValue = -3.402823e+38f; @@ -280,10 +285,10 @@ export const computeInPlaceSoftmax = (context: ComputeContext, input: TensorView } var sumVector = ${fillVector('f32', components, '0')}; - for (var i: u32 = 0; i < ${elementsPerWG} && i + localOffset < dComp; i++) { - sumVector += exp(${castToF32(dataType, components, 'x[offset + i]')} - maxValue); + for (var i: u32 = 0; i < uniforms.elements_per_wg && i + localOffset < uniforms.d_comp; i++) { + sumVector += exp(${castToF32(elemValueType, components, 'x[offset + i]')} - maxValue); } - wgSum[local_index] = ${sumVector('sumVector', components)}; + wgSum[local_idx] = ${sumVector('sumVector', components)}; workgroupBarrier(); var sum: f32 = 0; @@ -292,26 +297,24 @@ export const computeInPlaceSoftmax = (context: ComputeContext, input: TensorView } if (sum == 0) { - for (var i: u32 = 0; i < ${elementsPerWG} && i + localOffset < dComp; i++) { - x[offset + i] = ${fillVector(dataType, components, 'dInv')}; + for (var i: u32 = 0; i < uniforms.elements_per_wg && i + localOffset < uniforms.d_comp; i++) { + x[offset + i] = ${fillVector(elemValueType, components, 'uniforms.d_inv')}; } } else { - for (var i: u32 = 0; i < ${elementsPerWG} && i + localOffset < dComp; i++) { - let f32input = ${castToF32(dataType, components, 'x[offset + i]')}; + for (var i: u32 = 0; i < uniforms.elements_per_wg && i + localOffset < uniforms.d_comp; i++) { + let f32input = ${castToF32(elemValueType, components, 'x[offset + i]')}; x[offset + i] = ${inputHelper.type.value}(exp(f32input - maxValue) / sum); } } }`; + }; context.compute( { name: 'AttentionProbsSoftmax', - shaderCache: {hint: `${d}`}, + shaderCache: {hint: `${WG};${dataType};${components}`}, getShaderSource, - getRunData: () => ({ - outputs: [], - dispatchGroup: {x: n}, - }), + getRunData: () => ({outputs: [], dispatchGroup: {x: n}, programUniforms}), }, {inputs: [input], outputs: []}); }; @@ -326,88 +329,82 @@ const computeAttentionProbs = // TODO: handle mask const alpha = attributes.scale === 0 ? 
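// The in-place softmax shader above is the standard max-shifted formulation: reduce the row
// maximum, sum exp(x - max), then normalize, with a uniform fallback (`d_inv`) when the sum
// underflows to zero. A scalar reference version for comparison or testing:
function softmaxInPlace(x: Float32Array): void {
  let max = -Infinity;
  for (const v of x) max = Math.max(max, v);
  let sum = 0;
  for (const v of x) sum += Math.exp(v - max);
  if (sum === 0) {
    x.fill(1 / x.length);  // mirrors the shader writing `uniforms.d_inv` everywhere
  } else {
    for (let i = 0; i < x.length; i++) x[i] = Math.exp(x[i] - max) / sum;
  }
}
const row = new Float32Array([1, 2, 3]);
softmaxInPlace(row);
console.log(row);  // ~[0.090, 0.245, 0.665]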
1.0 / Math.sqrt(parameters.headSize) : attributes.scale; - - const dataType = tensorTypeToWsglStorageType(q.dataType); - const components = getMaxComponents(parameters.headSize); - const qInput = inputVariable('q', q.dataType, q.dims, components); - const kInput = inputVariable('key', key.dataType, key.dims, components); - const output = outputVariable('output', q.dataType, probsShape); - const vectorizedHeadSize = parameters.headSize / components; - const M = parameters.sequenceLength; - const N = parameters.totalSequenceLength; - const K = vectorizedHeadSize; - const TILE_SIZE = 12; - const dispatch = { x: Math.ceil(parameters.totalSequenceLength / TILE_SIZE), y: Math.ceil(parameters.sequenceLength / TILE_SIZE), z: parameters.batchSize * parameters.numHeads }; + const programUniforms: ProgramUniform[] = [ + {type: DataType.uint32, data: parameters.sequenceLength}, {type: DataType.uint32, data: vectorizedHeadSize}, + {type: DataType.uint32, data: parameters.totalSequenceLength}, + {type: DataType.uint32, data: parameters.kvSequenceLength}, {type: q.dataType, data: alpha} + ]; const inputs = [q, key]; - const getShaderSource = (shaderHelper: ShaderHelper) => ` - const M: u32 = ${M}u; - const N: u32 = ${N}u; - const K: u32 = ${K}u; - const alpha: ${dataType} = ${alpha}; + + const getShaderSource = (shaderHelper: ShaderHelper) => { + const qInput = inputVariable('q', q.dataType, q.dims, components); + const kInput = inputVariable('key', key.dataType, key.dims, components); + const output = outputVariable('output', q.dataType, probsShape); + const dataType = tensorTypeToWsglStorageType(q.dataType); + + const uniforms: UniformsArrayType = [ + {name: 'M', type: 'u32'}, {name: 'K', type: 'u32'}, {name: 'N', type: 'u32'}, + {name: 'kv_sequence_length', type: 'u32'}, {name: 'alpha', type: dataType as UniformDataElementType} + ]; + return ` const beta: ${dataType} = 1.0; const TILE_SIZE = ${TILE_SIZE}u; var tileQ: array<${qInput.type.storage}, ${TILE_SIZE * TILE_SIZE}>; var tileK: array<${qInput.type.storage}, ${TILE_SIZE * TILE_SIZE}>; - - ${shaderHelper.declareVariables(qInput, kInput, output)} - - @compute @workgroup_size(${TILE_SIZE}, ${TILE_SIZE}, 1) - fn main(@builtin(workgroup_id) workgroup_id : vec3, - @builtin(local_invocation_id) local_id : vec3, @builtin(local_invocation_index) local_index : u32) { - let global_idx = (workgroup_id.z * ${dispatch.x * dispatch.y}u + - workgroup_id.y * ${dispatch.x}u + workgroup_id.x) * ${TILE_SIZE * TILE_SIZE}u + local_index; - + ${shaderHelper.registerUniforms(uniforms).declareVariables(qInput, kInput, output)} + ${shaderHelper.mainStart([ + TILE_SIZE, TILE_SIZE, 1 + ])} // x holds the N and y holds the M let headIdx = workgroup_id.z; let m = workgroup_id.y * TILE_SIZE; let n = workgroup_id.x * TILE_SIZE; - let lm = m + local_id.y; - let ln = n + local_id.x; - - let qOffset = ${parameters.sequenceLength * vectorizedHeadSize} * headIdx + m * K; - let kOffset = ${parameters.kvSequenceLength * vectorizedHeadSize} * headIdx + n * K; + let qOffset = uniforms.M * uniforms.K * headIdx + m * uniforms.K; + let kOffset = uniforms.kv_sequence_length * uniforms.K * headIdx + n * uniforms.K; var value = ${fillVector(dataType, components)}; - for (var w: u32 = 0u; w < K; w += TILE_SIZE) { - if (m + local_id.y < M && w + local_id.x < K) { - tileQ[TILE_SIZE * local_id.y + local_id.x] = q[qOffset + local_id.y * K + w + local_id.x]; + for (var w: u32 = 0u; w < uniforms.K; w += TILE_SIZE) { + if (global_id.y < uniforms.M && w + local_id.x < uniforms.K) { + tileQ[TILE_SIZE * 
local_id.y + local_id.x] = q[qOffset + local_id.y * uniforms.K + w + local_id.x]; } - if (n + local_id.y < N && w + local_id.x < K) { - tileK[TILE_SIZE * local_id.y + local_id.x] = key[kOffset + local_id.y * K + w + local_id.x]; + if (n + local_id.y < uniforms.N && w + local_id.x < uniforms.K) { + tileK[TILE_SIZE * local_id.y + local_id.x] = key[kOffset + local_id.y * uniforms.K + w + local_id.x]; } workgroupBarrier(); - for (var k: u32 = 0u; k ({ outputs: [{dims: probsShape, dataType: q.dataType, gpuDataType: GpuDataType.default}], dispatchGroup: dispatch, + programUniforms }), getShaderSource, }, @@ -423,78 +420,76 @@ const computeAttentionProbs = const computeVxAttentionScore = (context: ComputeContext, probs: TensorView, v: TensorView, params: AttentionParameters) => { const outputShape = [params.batchSize, params.sequenceLength, params.vHiddenSize]; - - const probsHelper = inputVariable('probs', probs.dataType, probs.dims); - const vHelper = inputVariable('v', v.dataType, v.dims); - const output = outputVariable('output', probs.dataType, outputShape); - - const dataType = tensorTypeToWsglStorageType(probs.dataType); - const TILE_SIZE = 12; const dispatch = { x: Math.ceil(params.vHeadSize / TILE_SIZE), y: Math.ceil(params.sequenceLength / TILE_SIZE), z: params.batchSize * params.numHeads }; + const programUniforms: ProgramUniform[] = [ + {type: DataType.uint32, data: params.sequenceLength}, {type: DataType.uint32, data: params.totalSequenceLength}, + {type: DataType.uint32, data: params.vHeadSize}, {type: DataType.uint32, data: params.numHeads}, + {type: DataType.uint32, data: params.vHiddenSize} + ]; - const getShaderSource = (shaderHelper: ShaderHelper) => ` - const M: u32 = ${params.sequenceLength}u; - const N: u32 = ${params.vHeadSize}u; - const K: u32 = ${params.totalSequenceLength}u; - const numHeads: u32 = ${params.numHeads}u; + const getShaderSource = (shaderHelper: ShaderHelper) => { + const probsHelper = inputVariable('probs', probs.dataType, probs.dims); + const vHelper = inputVariable('v', v.dataType, v.dims); + const output = outputVariable('output', probs.dataType, outputShape); + const uniforms: UniformsArrayType = [ + {name: 'M', type: 'u32'}, {name: 'K', type: 'u32'}, {name: 'N', type: 'u32'}, + {name: 'num_heads', type: 'u32'}, {name: 'v_hidden_size', type: 'u32'} + ]; + return ` const TILE_SIZE = ${TILE_SIZE}u; - - var tileQ: array<${probsHelper.type.storage}, ${TILE_SIZE * TILE_SIZE}>; - var tileK: array<${probsHelper.type.storage}, ${TILE_SIZE * TILE_SIZE}>; - - ${shaderHelper.declareVariables(probsHelper, vHelper, output)} - - @compute @workgroup_size(${TILE_SIZE}, ${TILE_SIZE}, 1) - fn main(@builtin(workgroup_id) workgroup_id : vec3, - @builtin(local_invocation_id) local_id : vec3, @builtin(local_invocation_index) local_index : u32) { - let global_idx = (workgroup_id.z * ${dispatch.x * dispatch.y}u + - workgroup_id.y * ${dispatch.x}u + workgroup_id.x) * ${TILE_SIZE * TILE_SIZE}u + local_index; - + var tileQ: array<${probsHelper.type.value}, ${TILE_SIZE * TILE_SIZE}>; + var tileK: array<${probsHelper.type.value}, ${TILE_SIZE * TILE_SIZE}>; + ${shaderHelper.registerUniforms(uniforms).declareVariables(probsHelper, vHelper, output)} + ${shaderHelper.mainStart([ + TILE_SIZE, TILE_SIZE, 1 + ])} let headIdx = workgroup_id.z; - let m = workgroup_id.y * TILE_SIZE + local_id.y; - let n = workgroup_id.x * TILE_SIZE + local_id.x; + let m = global_id.y; + let n = global_id.x; - let offsetA = headIdx * (M * K) + m * K; - let offsetB = headIdx * (N * K) + n; + let offsetA = 
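// The score (probs x V) dispatch above, instantiated with example attention dimensions
// (values assumed: batch 2, 8 heads, sequence length 128, vHeadSize 64, TILE_SIZE 12):
const TILE_SIZE = 12;
const params = {batchSize: 2, numHeads: 8, sequenceLength: 128, vHeadSize: 64};
const dispatch = {
  x: Math.ceil(params.vHeadSize / TILE_SIZE),       // 6 column tiles
  y: Math.ceil(params.sequenceLength / TILE_SIZE),  // 11 row tiles
  z: params.batchSize * params.numHeads,            // one z slice per (batch, head) pair
};
console.log(dispatch);  // { x: 6, y: 11, z: 16 }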
headIdx * (uniforms.M * uniforms.K) + m * uniforms.K; + let offsetB = headIdx * (uniforms.N * uniforms.K) + n; - var value = ${dataType}(0); - for (var w: u32 = 0u; w < K; w += TILE_SIZE) { - if (m < M && w + local_id.x < K) { + var value = ${probsHelper.type.storage}(0); + for (var w: u32 = 0u; w < uniforms.K; w += TILE_SIZE) { + if (m < uniforms.M && w + local_id.x < uniforms.K) { tileQ[TILE_SIZE * local_id.y + local_id.x] = probs[offsetA + w + local_id.x]; } - if (n < N && w + local_id.y < K) { - tileK[TILE_SIZE * local_id.y + local_id.x] = v[offsetB + (w + local_id.y) * N]; + if (n < uniforms.N && w + local_id.y < uniforms.K) { + tileK[TILE_SIZE * local_id.y + local_id.x] = v[offsetB + (w + local_id.y) * uniforms.N]; } workgroupBarrier(); - for (var k: u32 = 0u; k ({ outputs: [{dims: outputShape, dataType: probs.dataType, gpuDataType: GpuDataType.default}], dispatchGroup: dispatch, + programUniforms }), getShaderSource, }, @@ -517,71 +512,71 @@ const prepare = (context: ComputeContext, parameters: AttentionParameters) => { parameters.sequenceLength, parameters.headSize, ]; - - const dataType = tensorTypeToWsglStorageType(context.inputs[0].dataType); - const M = parameters.sequenceLength; const K = parameters.inputHiddenSize; const N = parameters.headSize; - const TILE_SIZE = 12; const dispatch = { x: Math.ceil(parameters.headSize / TILE_SIZE), y: Math.ceil(parameters.sequenceLength / TILE_SIZE), z: parameters.batchSize * parameters.numHeads }; + const inputs = [context.inputs[0], context.inputs[1], context.inputs[2]]; + const programUniforms: ProgramUniform[] = [ + {type: DataType.uint32, data: M}, {type: DataType.uint32, data: K}, {type: DataType.uint32, data: N}, + {type: DataType.uint32, data: parameters.numHeads}, {type: DataType.uint32, data: parameters.headSize}, + {type: DataType.uint32, data: parameters.hiddenSize}, + {type: DataType.uint32, data: parameters.hiddenSize + parameters.hiddenSize + parameters.vHiddenSize} + ]; - const getShaderSource = () => ` - const M: u32 = ${M}u; - const K: u32 = ${K}u; - const N: u32 = ${N}u; - const numHeads: u32 = ${parameters.numHeads}; - const ldb = ${parameters.hiddenSize + parameters.hiddenSize + parameters.vHiddenSize}u; + const getShaderSource = (shaderHelper: ShaderHelper) => { + const outputQ = outputVariable('output_q', inputs[0].dataType, outputShape); + const outputK = outputVariable('output_k', inputs[0].dataType, outputShape); + const outputV = outputVariable('output_v', inputs[0].dataType, outputShape); + const input = inputVariable('input', inputs[0].dataType, inputs[0].dims); + const weight = inputVariable('weight', inputs[1].dataType, inputs[1].dims); + const bias = inputVariable('bias', inputs[2].dataType, inputs[2].dims); + const dataType = input.type.storage; + + const uniforms: UniformsArrayType = [ + {name: 'M', type: 'u32'}, {name: 'K', type: 'u32'}, {name: 'N', type: 'u32'}, {name: 'num_heads', type: 'u32'}, + {name: 'head_size', type: 'u32'}, {name: 'hidden_size', type: 'u32'}, {name: 'ldb', type: 'u32'} + ]; + return ` const TILE_SIZE = ${TILE_SIZE}u; - var tileInput: array<${dataType}, ${TILE_SIZE * TILE_SIZE}>; var tileWeightQ: array<${dataType}, ${TILE_SIZE * TILE_SIZE}>; var tileWeightK: array<${dataType}, ${TILE_SIZE * TILE_SIZE}>; var tileWeightV: array<${dataType}, ${TILE_SIZE * TILE_SIZE}>; - - @group(0) @binding(0) var input: array<${dataType}>; - @group(0) @binding(1) var weight: array<${dataType}>; - @group(0) @binding(2) var bias: array<${dataType}>; - @group(0) @binding(3) var outputQ: 
array<${dataType}>; - @group(0) @binding(4) var outputK: array<${dataType}>; - @group(0) @binding(5) var outputV: array<${dataType}>; - - @compute @workgroup_size(${TILE_SIZE}, ${TILE_SIZE}, 1) - fn main(@builtin(workgroup_id) workgroup_id : vec3, - @builtin(local_invocation_id) local_id : vec3, @builtin(local_invocation_index) local_index : u32) { - let global_idx = (workgroup_id.z * ${dispatch.x * dispatch.y}u + - workgroup_id.y * ${dispatch.x}u + workgroup_id.x) * ${TILE_SIZE * TILE_SIZE}u + local_index; - - let batchIndex = workgroup_id.z / ${parameters.numHeads}; - let headNumber = workgroup_id.z % ${parameters.numHeads}; - let m = workgroup_id.y * TILE_SIZE + local_id.y; - let n = workgroup_id.x * TILE_SIZE + local_id.x; - - let inputOffset = batchIndex * (M * K) + m * K; - let biasOffsetQ = headNumber * ${parameters.headSize}; - let biasOffsetK = ${parameters.hiddenSize} + biasOffsetQ; - let biasOffsetV = ${parameters.hiddenSize} + biasOffsetK; + ${shaderHelper.registerUniforms(uniforms).declareVariables(input, weight, bias, outputQ, outputK, outputV)} + ${shaderHelper.mainStart([ + TILE_SIZE, TILE_SIZE, 1 + ])} + let batchIndex = workgroup_id.z / uniforms.num_heads; + let headNumber = workgroup_id.z % uniforms.num_heads; + let m = global_id.y; + let n = global_id.x; + + let inputOffset = batchIndex * (uniforms.M * uniforms.K) + m * uniforms.K; + let biasOffsetQ = headNumber * uniforms.head_size; + let biasOffsetK = uniforms.hidden_size + biasOffsetQ; + let biasOffsetV = uniforms.hidden_size + biasOffsetK; var valueQ = ${dataType}(0); var valueK = ${dataType}(0); var valueV = ${dataType}(0); - for (var w: u32 = 0u; w < K; w += TILE_SIZE) { - if (m < M && w + local_id.x < K) { + for (var w: u32 = 0u; w < uniforms.K; w += TILE_SIZE) { + if (m < uniforms.M && w + local_id.x < uniforms.K) { tileInput[TILE_SIZE * local_id.y + local_id.x] = input[inputOffset + w + local_id.x]; } - if (n < N && w + local_id.y < K) { - let offset = n + (w + local_id.y) * ldb; + if (n < uniforms.N && w + local_id.y < uniforms.K) { + let offset = n + (w + local_id.y) * uniforms.ldb; tileWeightQ[TILE_SIZE * local_id.y + local_id.x] = weight[biasOffsetQ + offset]; tileWeightK[TILE_SIZE * local_id.y + local_id.x] = weight[biasOffsetK + offset]; tileWeightV[TILE_SIZE * local_id.y + local_id.x] = weight[biasOffsetV + offset]; } workgroupBarrier(); - for (var k: u32 = 0u; k { workgroupBarrier(); } - let headOffset = (m * N + n) % ${parameters.headSize}; + let headOffset = (m * uniforms.N + n) % uniforms.head_size; valueQ += bias[headOffset + biasOffsetQ]; valueK += bias[headOffset + biasOffsetK]; valueV += bias[headOffset + biasOffsetV]; - let offset = workgroup_id.z * M * N; - if (m < M && n < N) { - let outputIdx = offset + m * N + n; - outputQ[outputIdx] = valueQ; - outputK[outputIdx] = valueK; - outputV[outputIdx] = valueV; + let offset = workgroup_id.z * uniforms.M * uniforms.N; + if (m < uniforms.M && n < uniforms.N) { + let outputIdx = offset + m * uniforms.N + n; + output_q[outputIdx] = valueQ; + output_k[outputIdx] = valueK; + output_v[outputIdx] = valueV; } }`; - - const inputs = [context.inputs[0], context.inputs[1], context.inputs[2]]; + }; return context.compute( { name: 'AttentionPrepare', - shaderCache: {hint: JSON.stringify(parameters)}, + shaderCache: {inputDependencies: ['type', 'type', 'type']}, getRunData: () => ({ outputs: [ {dims: outputShape, dataType: context.inputs[0].dataType, gpuDataType: GpuDataType.default}, @@ -619,6 +613,7 @@ const prepare = (context: ComputeContext, parameters: 
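// `ldb` above is the row stride of the packed QKV weight: the Q, K and V projection matrices
// sit side by side in one buffer, so a row holds hidden + hidden + vHidden elements and each
// section is addressed by a column offset. Sizes below are assumed for illustration:
const hiddenSize = 768;
const vHiddenSize = 768;
const ldb = hiddenSize + hiddenSize + vHiddenSize;  // 2304 elements per weight row
const headSize = 64;
const headNumber = 2;
const biasOffsetQ = headNumber * headSize;     // 128: this head's columns in the Q section
const biasOffsetK = hiddenSize + biasOffsetQ;  // 896: same columns, K section
const biasOffsetV = hiddenSize + biasOffsetK;  // 1664: same columns, V section
console.log(ldb, biasOffsetQ, biasOffsetK, biasOffsetV);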
AttentionParameters) => { {dims: outputShape, dataType: context.inputs[0].dataType, gpuDataType: GpuDataType.default}, ], dispatchGroup: dispatch, + programUniforms }), getShaderSource, }, diff --git a/js/web/lib/wasm/jsep/webgpu/ops/batch-norm.ts b/js/web/lib/wasm/jsep/webgpu/ops/batch-norm.ts index ec9da2613f40..39b932375891 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/batch-norm.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/batch-norm.ts @@ -3,12 +3,13 @@ import {env} from 'onnxruntime-common'; +import {DataType} from '../../../wasm-common'; import {TensorView} from '../../tensor-view'; import {ShapeUtil} from '../../util'; import {AttributeWithCacheKey, createAttributeWithCacheKey} from '../attribute-with-cache-key'; import {ComputeContext, ProgramInfo} from '../types'; -import {createTensorShapeVariables, enableShapesUniforms, getMaxComponents, inputVariable, outputVariable, ShaderHelper} from './common'; +import {createTensorShapeVariables, getMaxComponents, inputVariable, outputVariable, ShaderHelper} from './common'; export interface BatchNormAttributes extends AttributeWithCacheKey { readonly epsilon: number; @@ -61,7 +62,7 @@ const createBatchNormInferenceProgramInfo = const cComponents = format === 'NHWC' && yShape.length > 1 ? components : 1; const outputSize = ShapeUtil.size(yShape) / components; // Only support uniforms for opset version >= 9 (spatial = true). - const useShapesUniforms = enableShapesUniforms(yShape.length) && spatial; + const useShapesUniforms = spatial; const shapeOrRank = useShapesUniforms ? yShape.length : yShape; const x = inputVariable('x', inputs[0].dataType, inputs[0].dims, components); const scale = inputVariable('scale', inputs[1].dataType, inputs[1].dims, cComponents); @@ -108,7 +109,7 @@ const createBatchNormInferenceProgramInfo = let inputMean = ${inputMean.getByOffset('cOffset')}; let inputVar = ${inputVar.getByOffset('cOffset')}; let x = ${x.getByOffset('global_idx')}; - let value = (x - inputMean) / sqrt(inputVar + epsilon) * scale + bias; + let value = (x - inputMean) * inverseSqrt(inputVar + epsilon) * scale + bias; ${y.setByOffset('global_idx', 'value')} }`; return { @@ -123,11 +124,11 @@ const createBatchNormInferenceProgramInfo = dispatchGroup: {x: Math.ceil(outputSize / 64 /* workgroup size */)}, programUniforms: useShapesUniforms ? 
[ - {type: 'uint32', data: outputSize}, + {type: DataType.uint32, data: outputSize}, ...createTensorShapeVariables(yShape), ] : [ - {type: 'uint32', data: outputSize}, + {type: DataType.uint32, data: outputSize}, ], }), }; diff --git a/js/web/lib/wasm/jsep/webgpu/ops/bias-split-gelu.ts b/js/web/lib/wasm/jsep/webgpu/ops/bias-split-gelu.ts index a81a7a8f1df5..089fecd758e3 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/bias-split-gelu.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/bias-split-gelu.ts @@ -43,7 +43,7 @@ const createBiasSplitGeluProgramInfo = (inputs: readonly TensorView[]): ProgramI ${shaderHelper.declareVariables(input, bias, output)} - ${erfImpl(`vec4<${dataType}>`, dataType)} + ${erfImpl(dataType)} ${shaderHelper.mainStart()} ${shaderHelper.guardAgainstOutOfBoundsWorkgroupSizes(outputSize)} diff --git a/js/web/lib/wasm/jsep/webgpu/ops/binary-op.ts b/js/web/lib/wasm/jsep/webgpu/ops/binary-op.ts index c033c0ba0535..a094fffe239c 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/binary-op.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/binary-op.ts @@ -6,7 +6,7 @@ import {TensorView} from '../../tensor-view'; import {BroadcastUtil, ShapeUtil} from '../../util'; import {ComputeContext, ProgramInfo} from '../types'; -import {createTensorShapeVariables, enableShapesUniforms, inputVariable, outputVariable, ShaderHelper} from './common'; +import {createTensorShapeVariables, inputVariable, outputVariable, ShaderHelper} from './common'; type BuiltinFunctionName = string; type BinaryCustomExpression = (expressionA: string, expressionB: string) => string; @@ -18,8 +18,7 @@ type BinaryFunctionCall = BuiltinFunctionName|BinaryCustomExpression|{ const createBinaryOpProgramShader = (shaderHelper: ShaderHelper, dimsA: readonly number[], dimsB: readonly number[], dimsOutput: readonly number[], vectorize: boolean, doBroadcast: boolean, sharedDimensionDivisibleBy4: boolean, funcCall: BinaryFunctionCall, - typeA: number, typeB: number, typeOutput: number, useShapesUniforms: boolean, - additionalImplementation?: string) => { + typeA: number, typeB: number, typeOutput: number, additionalImplementation?: string) => { let expressionScalar: BinaryCustomExpression; let expressionVector: BinaryCustomExpression; if (typeof funcCall === 'string') { @@ -31,12 +30,9 @@ const createBinaryOpProgramShader = expressionVector = funcCall.vector; } - const inputAShapeOrRank = useShapesUniforms ? dimsA.length : dimsA; - const inputBShapeOrRank = useShapesUniforms ? dimsB.length : dimsB; - const outputShapeOrRank = useShapesUniforms ? dimsOutput.length : dimsOutput; - const output = outputVariable('outputData', typeOutput, outputShapeOrRank, 4); - const a = inputVariable('aData', typeA, inputAShapeOrRank, 4); - const b = inputVariable('bData', typeB, inputBShapeOrRank, 4); + const output = outputVariable('outputData', typeOutput, dimsOutput.length, 4); + const a = inputVariable('aData', typeA, dimsA.length, 4); + const b = inputVariable('bData', typeB, dimsB.length, 4); let assignment: string; if (vectorize) { @@ -169,30 +165,23 @@ const createBinaryOpProgramInfo = vectorize = true; } cacheKeyAux.push(vectorize); - const useShapesUniforms = enableShapesUniforms(a.dims.length) && enableShapesUniforms(b.dims.length) && - enableShapesUniforms(outputShape.length); + return { name, shaderCache: { hint: cacheKey + cacheKeyAux.map((x) => x.toString()).join('_'), - inputDependencies: useShapesUniforms ? 
['rank', 'rank'] : ['dims', 'dims'], + inputDependencies: ['rank', 'rank'], }, getShaderSource: (shaderHelper) => createBinaryOpProgramShader( shaderHelper, a.dims, b.dims, outputShape, vectorize, isBroadcast, sharedDimensionDivisibleBy4, funcCall, - a.dataType, b.dataType, outputDataType, useShapesUniforms, additionalImplementation), + a.dataType, b.dataType, outputDataType, additionalImplementation), getRunData: () => ({ outputs: [{dims: outputShape, dataType: outputDataType}], dispatchGroup: {x: Math.ceil(outputSize / 64 /* workgroup size */ / 4 /* component size */)}, - programUniforms: useShapesUniforms ? - [ - {type: 'uint32', data: Math.ceil(ShapeUtil.size(outputShape) / 4)}, - ...createTensorShapeVariables(a.dims), - ...createTensorShapeVariables(b.dims), - ...createTensorShapeVariables(outputShape), - ] : - [ - {type: 'uint32', data: Math.ceil(ShapeUtil.size(outputShape) / 4)}, - ], + programUniforms: [ + {type: DataType.uint32, data: Math.ceil(ShapeUtil.size(outputShape) / 4)}, + ...createTensorShapeVariables(a.dims, b.dims, outputShape) + ], }), }; }; diff --git a/js/web/lib/wasm/jsep/webgpu/ops/common.ts b/js/web/lib/wasm/jsep/webgpu/ops/common.ts index 3ce114c5d388..17ac814c4403 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/common.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/common.ts @@ -3,7 +3,7 @@ import {DataType} from '../../../wasm-common'; import {ShapeUtil} from '../../util'; -import {ProgramUniform} from '../types'; +import {ProgramUniform, ProgramUniformVariableInfo} from '../types'; /** * constant value for a workgroup size. @@ -259,8 +259,16 @@ export const tensorTypeToWsglValueType = (type: DataType, components: 1|2|3|4 = return typeof mappedType === 'string' ? mappedType : mappedType[1]; }; -export const createTensorShapeVariables = (dims: readonly number[]): ProgramUniform[] => - dims.length === 0 ? [] : [{type: 'uint32', data: dims}, {type: 'uint32', data: ShapeUtil.computeStrides(dims)}]; +export const createTensorShapeVariables = (...dims: ReadonlyArray): ProgramUniform[] => { + const programUniforms: ProgramUniform[] = []; + dims.forEach(dim => { + if (dim.length !== 0) { + programUniforms.push( + {type: DataType.uint32, data: dim}, {type: DataType.uint32, data: ShapeUtil.computeStrides(dim)}); + } + }); + return programUniforms; +}; /** * A helper function to get maximum vector size for specified data length @@ -330,18 +338,28 @@ export const sumVector = (name: string, components: number) => { * @param name - the name of variable. * @param index - the index of variable element. * @param length - the length of variable. + * @param type - the type of variable, optional. */ -export const getElementAt = (name: string, index: number|string, length: number): string => { - if (name.startsWith('uniforms.') && length > 4) { - if (typeof (index) === 'string') { - return `${name}[(${index}) / 4][(${index}) % 4]`; - } else { - return `${name}[${Math.floor(index / 4)}][${index % 4}]`; - } - } else { - return length > 1 ? 
`${name}[${index}]` : name; - } -}; +export const getElementAt = + (name: string, index: number|string, length: number, type?: UniformDataElementType): string => { + if (name.startsWith('uniforms.') && length > 4) { + if (typeof (index) === 'string') { + if (type === 'f16') { + return `${name}[(${index}) / 8][(${index}) % 8 / 4][(${index}) % 8 % 4]`; + } else { + return `${name}[(${index}) / 4][(${index}) % 4]`; + } + } else { + if (type === 'f16') { + return `${name}[${Math.floor(index / 8)}][${Math.floor(index % 8 / 4)}][${index % 8 % 4}]`; + } else { + return `${name}[${Math.floor(index / 4)}][${index % 4}]`; + } + } + } else { + return length > 1 ? `${name}[${index}]` : name; + } + }; /** * A helper function to get a IndicesHelper for a given input or output. @@ -688,7 +706,7 @@ export const internalVariable = (name: string, type: number, shapeOrRank: number|readonly number[], components: 1|2|3|4 = 1): IndicesHelper => createIndicesHelper(name, type, shapeOrRank, 'internal', components); -export type UniformDataElementType = 'u32'|'f32'|'i32'; +export type UniformDataElementType = 'u32'|'f16'|'f32'|'i32'; export type UniformsArrayType = Array<{name: string; type: UniformDataElementType; length?: number}>; /** @@ -765,7 +783,7 @@ export interface ShaderHelper { } class ShaderHelperImpl implements ShaderHelper { - constructor(private normalizedDispatchGroup: [number, number, number]) {} + constructor(private normalizedDispatchGroup: [number, number, number], private limits: GPUSupportedLimits) {} guardAgainstOutOfBoundsWorkgroupSizes(size: number|string): string { // Guard against out-of-bounds work group sizes @@ -778,10 +796,27 @@ class ShaderHelperImpl implements ShaderHelper { const workgroupSizeY = typeof workgroupSize === 'number' ? 1 : workgroupSize[1]; const workgroupSizeZ = typeof workgroupSize === 'number' ? 1 : workgroupSize[2]; + if (workgroupSizeX > this.limits.maxComputeWorkgroupSizeX || + workgroupSizeY > this.limits.maxComputeWorkgroupSizeY || + workgroupSizeZ > this.limits.maxComputeWorkgroupSizeZ) { + throw new Error(`workgroup size [${workgroupSizeX}, ${workgroupSizeY}, ${ + workgroupSizeZ}] exceeds the maximum workgroup size [${this.limits.maxComputeWorkgroupSizeX}, ${ + this.limits.maxComputeWorkgroupSizeY}, ${this.limits.maxComputeWorkgroupSizeZ}].`); + } + + if (workgroupSizeX * workgroupSizeY * workgroupSizeZ > this.limits.maxComputeInvocationsPerWorkgroup) { + throw new Error(`workgroup size [${workgroupSizeX}, ${workgroupSizeY}, ${ + workgroupSizeZ}] exceeds the maximum workgroup invocations ${ + this.limits.maxComputeInvocationsPerWorkgroup}.`); + } + const is1DimensionDispatch = this.normalizedDispatchGroup[1] === 1 && this.normalizedDispatchGroup[2] === 1; const paramList = is1DimensionDispatch ? `@builtin(global_invocation_id) global_id : vec3, + @builtin(workgroup_id) workgroup_id : vec3, @builtin(local_invocation_id) local_id : vec3` : - `@builtin(local_invocation_index) local_idx : u32, + `@builtin(global_invocation_id) global_id : vec3, + @builtin(local_invocation_id) local_id : vec3, + @builtin(local_invocation_index) local_idx : u32, @builtin(workgroup_id) workgroup_id : vec3, @builtin(num_workgroups) num_workgroups : vec3`; const globalIdxDefinition = is1DimensionDispatch ? 
@@ -859,7 +894,11 @@ class ShaderHelperImpl implements ShaderHelper { const uniformSnippets: string[] = []; for (const {name, type, length} of this.uniforms) { if (length && length > 4) { - uniformSnippets.push(`${name}:array, ${Math.ceil(length / 4)}>`); + if (type === 'f16') { + uniformSnippets.push(`@align(16) ${name}:array, ${Math.ceil(length / 8)}>`); + } else { + uniformSnippets.push(`${name}:array, ${Math.ceil(length / 4)}>`); + } } else { const typeTemp = length == null || length === 1 ? type : `vec${length}<${type}>`; uniformSnippets.push(`${name}:${typeTemp}`); @@ -879,9 +918,24 @@ class ShaderHelperImpl implements ShaderHelper { return this.uniformDeclaration() + this.variables.map(i => i.impl()).join('\n') + this.internalVariables.map(i => i.impl()).join('\n'); } + + /** + * Get the variable info of the shader program. + */ + get variablesInfo(): ProgramUniformVariableInfo[]|undefined { + if (this.uniforms.length === 0) { + return undefined; + } + + const uniformWgslTypeToDataType = (type: UniformDataElementType) => + ([DataType.uint32, DataType.float16, DataType.float, + DataType.int32][['u32', 'f16', 'f32', 'i32'].indexOf(type)]); + return this.uniforms.map(u => ([uniformWgslTypeToDataType(u.type), u.length ?? 1])); + } } -export const createShaderHelper = (dispatchGroup: [number, number, number]) => new ShaderHelperImpl(dispatchGroup); +export const createShaderHelper = (dispatchGroup: [number, number, number], limits: GPUSupportedLimits) => + new ShaderHelperImpl(dispatchGroup, limits); /** * This function comes from https://github.com/tensorflow/tfjs/blob/master/tfjs-core/src/ops/broadcast_util.ts#L18-L40 @@ -906,6 +960,3 @@ export const getBroadcastDims = (inShape: readonly number[], outShape: readonly } return dims; }; - -// TODO: remove this when all related uses have been removed. -export const enableShapesUniforms = (_rank: number): boolean => true; diff --git a/js/web/lib/wasm/jsep/webgpu/ops/concat.ts b/js/web/lib/wasm/jsep/webgpu/ops/concat.ts index 43cc4a4c080b..010ee589c44f 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/concat.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/concat.ts @@ -1,36 +1,44 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. 
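Note on the `getElementAt` and uniform-declaration changes in common.ts above: WGSL uniform buffers require a 16-byte array stride, so an `f16` array uniform is packed eight halves per element (hence the `@align(16)` and `Math.ceil(length / 8)` in the declaration) and indexed as `[i / 8][(i % 8) / 4][i % 8 % 4]`. A minimal, self-contained TypeScript sketch of that addressing scheme; `uniformElementAt` is a hypothetical helper for illustration, not the library API:

    // Sketch: build a WGSL accessor string for element i of a packed uniform array.
    // f16 arrays: 8 halves per 16-byte stride -> three-level index.
    // u32/i32/f32 arrays: one vec4 per 16-byte stride -> two-level index.
    const uniformElementAt = (name: string, i: number, type: 'f16'|'f32'|'i32'|'u32'): string =>
        type === 'f16' ?
            `${name}[${Math.floor(i / 8)}][${Math.floor((i % 8) / 4)}][${(i % 8) % 4}]` :
            `${name}[${Math.floor(i / 4)}][${i % 4}]`;

    // uniformElementAt('uniforms.dims', 9, 'f16') === 'uniforms.dims[1][0][1]'
    // uniformElementAt('uniforms.dims', 9, 'u32') === 'uniforms.dims[2][1]'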
+import {DataType} from '../../../wasm-common'; import {TensorView} from '../../tensor-view'; import {ShapeUtil} from '../../util'; import {AttributeWithCacheKey, createAttributeWithCacheKey} from '../attribute-with-cache-key'; import {ComputeContext, ProgramInfo, ProgramInputTensorInfoDependency, ProgramUniform} from '../types'; -import {createTensorShapeVariables, enableShapesUniforms, IndicesHelper, inputVariable, outputVariable, ShaderHelper} from './common'; +import {createTensorShapeVariables, IndicesHelper, inputVariable, outputVariable, ShaderHelper} from './common'; export interface ConcatAttributes extends AttributeWithCacheKey { readonly axis: number; } -const validateInputs = (inputs: readonly TensorView[]): void => { +const validateInputs = (inputs: readonly TensorView[], axis: number): void => { if (!inputs || inputs.length < 1) { throw new Error('too few inputs'); } - - const inputType = inputs[0].dataType; - const inputDimensionality = inputs[0].dims.length; - - for (const input of inputs) { + const referenceIndex = 0; + const referenceInput = inputs[referenceIndex]; + const inputType = referenceInput.dataType; + const inputRank = referenceInput.dims.length; + inputs.forEach((input, i) => { + if (i === referenceIndex) { + return; + } // make sure types of all inputs match if (input.dataType !== inputType) { throw new Error('input tensors should be one type'); } - // make sure the dimensionality of all inputs are the same - if (input.dims.length !== inputDimensionality) { + if (input.dims.length !== inputRank) { throw new Error('input tensors should have the same shape'); } - } + input.dims.forEach((dim, i) => { + if (i !== axis && dim !== referenceInput.dims[i]) { + throw new Error('non concat dimensions must match'); + } + }); + }); }; const calculateInputIndexImpl = (numberOfTensors: number, sizeInConcatAxisStr: string): string => ` @@ -63,75 +71,43 @@ const assignOutputData = (inputs: readonly IndicesHelper[], output: IndicesHelpe return codeLines.join('\n'); }; -const createConcatProgramInfo = (inputs: readonly TensorView[], axis: number): ProgramInfo => { - const inputShape = inputs[0].dims.slice(); - if (axis >= inputShape.length || axis < (-1 * inputShape.length)) { - throw new Error('axis specified for concat doesn\'t match input dimensionality'); - } - const adjustedAxis = (axis < 0) ? 
inputShape.length + axis : axis; - // ensure all of the non-concatenated axes match each other - // calculate the shape of the output tensor while we do that - const outputShape = inputShape.slice(0); - for (let i = 1; i < inputs.length; i++) { - const dataNShape = inputs[i].dims.slice(); - for (let axisIndex = 0; axisIndex < inputShape.length; axisIndex++) { - // add to the placeholder for computing output shape - if (axisIndex === adjustedAxis) { - outputShape[adjustedAxis] += dataNShape[axisIndex]; +const createConcatProgramInfo = + (inputs: readonly TensorView[], adjustedAxis: number, outputShape: number[], dataType: DataType): ProgramInfo => { + const outputSize = ShapeUtil.size(outputShape); + + const sizeInConcatAxis = new Array(inputs.length); + const inputVars = new Array(inputs.length); + + let previousSum = 0; + const inputDependencies: ProgramInputTensorInfoDependency[] = []; + const inputRanks = []; + const programUniforms: ProgramUniform[] = [{type: DataType.uint32, data: outputSize}]; + for (let i = 0; i < inputs.length; ++i) { + previousSum += inputs[i].dims[adjustedAxis]; + sizeInConcatAxis[i] = previousSum; + inputRanks.push(inputs[i].dims.length); + inputVars[i] = inputVariable(`input${i}`, dataType, inputRanks[i]); + inputDependencies.push('rank'); + programUniforms.push({type: DataType.uint32, data: sizeInConcatAxis[i]}); } - // ensure all non-cancatenated axes match each other - else if (inputShape[axisIndex] !== dataNShape[axisIndex]) { - throw new Error('non concat dimensions must match'); + for (let i = 0; i < inputs.length; ++i) { + programUniforms.push(...createTensorShapeVariables(inputs[i].dims)); } - } - } - - const outputSize = ShapeUtil.size(outputShape); - - const sizeInConcatAxis = new Array(inputs.length); - const inputVars = new Array(inputs.length); - const dataType = inputs[0].dataType; - - let previousSum = 0; - const inputDependencies: ProgramInputTensorInfoDependency[] = []; - const inputShapeOrRanks = []; - const enableInputShapesUniforms = []; - const programUniforms: ProgramUniform[] = [{type: 'uint32', data: outputSize}]; - for (let i = 0; i < inputs.length; ++i) { - previousSum += inputs[i].dims[adjustedAxis]; - sizeInConcatAxis[i] = previousSum; - enableInputShapesUniforms.push(enableShapesUniforms(inputs[i].dims.length)); - inputShapeOrRanks.push(enableInputShapesUniforms[i] ? inputs[i].dims.length : inputs[i].dims); - inputVars[i] = inputVariable(`input${i}`, dataType, inputShapeOrRanks[i]); - inputDependencies.push(enableInputShapesUniforms[i] ? 'rank' : 'dims'); - programUniforms.push({type: 'uint32', data: sizeInConcatAxis[i]}); - } - for (let i = 0; i < inputs.length; ++i) { - if (enableInputShapesUniforms[i]) { - programUniforms.push(...createTensorShapeVariables(inputs[i].dims)); - } - } + programUniforms.push(...createTensorShapeVariables(outputShape)); - const enableOutputShapesUniforms = enableShapesUniforms(outputShape.length); - if (enableOutputShapesUniforms) { - programUniforms.push(...createTensorShapeVariables(outputShape)); - } - - const outputShapeOrRank = enableOutputShapesUniforms ? 
outputShape.length : outputShape; - const output = outputVariable('output', dataType, outputShapeOrRank); - - const indicesAxis = output.indicesGet('indices', adjustedAxis); - const sizeInConcatAxisStr = - Array.from(Array(sizeInConcatAxis.length).keys()).map(i => `uniforms.sizeInConcatAxis${i}`).join(','); - const getShaderSource = (shaderHelper: ShaderHelper) => ` + const output = outputVariable('output', dataType, outputShape.length); + const indicesAxis = output.indicesGet('indices', adjustedAxis); + const sizeInConcatAxisStr = + Array.from(Array(sizeInConcatAxis.length).keys()).map(i => `uniforms.sizeInConcatAxis${i}`).join(','); + const getShaderSource = (shaderHelper: ShaderHelper) => ` ${(() => { - shaderHelper.registerUniform('outputSize', 'u32'); - for (let i = 0; i < inputs.length; i++) { - shaderHelper.registerUniform(`sizeInConcatAxis${i}`, 'u32'); - } - return shaderHelper.declareVariables(...inputVars, output); - })()} + shaderHelper.registerUniform('outputSize', 'u32'); + for (let i = 0; i < inputs.length; i++) { + shaderHelper.registerUniform(`sizeInConcatAxis${i}`, 'u32'); + } + return shaderHelper.declareVariables(...inputVars, output); + })()} ${calculateInputIndexImpl(sizeInConcatAxis.length, sizeInConcatAxisStr)} @@ -149,21 +125,30 @@ const createConcatProgramInfo = (inputs: readonly TensorView[], axis: number): P ${assignOutputData(inputVars, output)} }`; - return { - name: 'Concat', - shaderCache: {hint: `${axis}`, inputDependencies}, - getRunData: () => ({ - outputs: [{dims: outputShape, dataType: inputs[0].dataType}], - dispatchGroup: {x: Math.ceil(outputSize / 64 /* workgroup size */)}, - programUniforms, - }), - getShaderSource, - }; -}; + return { + name: 'Concat', + shaderCache: {hint: `${adjustedAxis}`, inputDependencies}, + getRunData: () => ({ + outputs: [{dims: outputShape, dataType}], + dispatchGroup: {x: Math.ceil(outputSize / 64 /* workgroup size */)}, + programUniforms, + }), + getShaderSource, + }; + }; export const concat = (context: ComputeContext, attributes: ConcatAttributes): void => { - validateInputs(context.inputs); - context.compute(createConcatProgramInfo(context.inputs, attributes.axis)); + const inputs = context.inputs; + const inputShape = inputs[0].dims; + const adjustedAxis = ShapeUtil.normalizeAxis(attributes.axis, inputShape.length); + validateInputs(inputs, adjustedAxis); + const outputShape = inputShape.slice(); + outputShape[adjustedAxis] = + inputs.reduce((sum, input) => sum + (input.dims.length > adjustedAxis ? input.dims[adjustedAxis] : 0), 0); + // 0 length tensors are valid for concat, remove them + const nonEmptyInputs = inputs.filter(input => ShapeUtil.size(input.dims) > 0); + context.compute( + createConcatProgramInfo(nonEmptyInputs, adjustedAxis, outputShape, inputs[0].dataType), {inputs: nonEmptyInputs}); }; export const parseConcatAttributes = (attributes: Record): ConcatAttributes => diff --git a/js/web/lib/wasm/jsep/webgpu/ops/conv-grouped.ts b/js/web/lib/wasm/jsep/webgpu/ops/conv-grouped.ts index 14482272bad3..924030125c42 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/conv-grouped.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/conv-grouped.ts @@ -1,13 +1,14 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. 
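The concat rework above moves axis normalization and output-shape computation out of the program info and drops zero-size inputs before dispatch. A self-contained sketch of that shape logic on plain `number[]` shapes (illustrative only; it assumes all inputs share the same rank, which `validateInputs` enforces):

    const size = (dims: readonly number[]): number => dims.reduce((a, d) => a * d, 1);

    // Equivalent of ShapeUtil.normalizeAxis plus the reduce over the concat axis.
    const concatOutputShape = (shapes: ReadonlyArray<readonly number[]>, axis: number): number[] => {
      const adjustedAxis = axis < 0 ? axis + shapes[0].length : axis;
      const out = shapes[0].slice();
      out[adjustedAxis] = shapes.reduce((sum, s) => sum + s[adjustedAxis], 0);
      return out;
    };

    // Zero-size tensors are valid concat inputs but contribute nothing, so they are filtered out.
    const nonEmpty = (shapes: ReadonlyArray<readonly number[]>) => shapes.filter((s) => size(s) > 0);

    // concatOutputShape([[2, 3], [2, 0], [2, 5]], 1) -> [2, 8]; the [2, 0] input is skipped.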
+import {DataType} from '../../../wasm-common'; import {TensorView} from '../../tensor-view'; import {ShapeUtil} from '../../util'; -import {ProgramInfo} from '../types'; +import {ProgramInfo, ProgramInputTensorInfoDependency, ProgramUniform} from '../types'; -import {inputVariable, outputVariable, ShaderHelper} from './common'; +import {createTensorShapeVariables, getMaxComponents, inputVariable, outputVariable, ShaderHelper, tensorTypeToWsglStorageType, UniformsArrayType} from './common'; import {calculateOutputShape, ConvAttributes} from './conv'; -import {getActivationSnippet} from './fuse-utils'; +import {appendActivationUniforms, appendActivationUniformsData, getActivationSnippet} from './fuse-utils'; /** * naive grouped conv implementation, supports 1d/2d conv @@ -27,52 +28,70 @@ export const createGroupedConvProgramInfo = xShape, wShape, attributes.dilations, attributes.pads, attributes.strides, isChannelLast); const outputSize = ShapeUtil.size(outputShape); - const output = outputVariable('output', inputs[0].dataType, outputShape); - const {activationFunction, applyActivation} = getActivationSnippet(attributes, output.type.value); - const x = inputVariable('x', inputs[0].dataType, xShape); - const w = inputVariable('w', inputs[1].dataType, wShape); - const inputVars = [x, w]; + const programUniforms: ProgramUniform[] = [ + {type: DataType.uint32, data: outputSize}, {type: DataType.uint32, data: attributes.dilations}, + {type: DataType.uint32, data: [attributes.strides[0], attributes.strides[1]]}, + {type: DataType.uint32, data: [attributes.pads[0], attributes.pads[1]]}, + {type: DataType.uint32, data: outputChannelsPerGroup} + ]; + appendActivationUniformsData(attributes, programUniforms); + programUniforms.push(...createTensorShapeVariables(xShape, wShape)); + const inputDependencies: ProgramInputTensorInfoDependency[] = ['rank', 'rank']; if (hasBias) { - inputVars.push(inputVariable('b', inputs[2].dataType, inputs[2].dims)); + programUniforms.push(...createTensorShapeVariables(inputs[2].dims)); + inputDependencies.push('rank'); } + programUniforms.push(...createTensorShapeVariables(outputShape)); - const getShaderSource = (shaderHelper: ShaderHelper) => ` - const strides: vec2 = vec2(${attributes.strides[0]}u, ${attributes.strides[1]}u); - const pads: vec2 = vec2(${attributes.pads[0]}u, ${attributes.pads[1]}u); - - ${shaderHelper.declareVariables(...inputVars, output)} + const getShaderSource = (shaderHelper: ShaderHelper) => { + const output = outputVariable('output', inputs[0].dataType, outputShape.length); + const baseType = tensorTypeToWsglStorageType(output.type.tensor); + const applyActivation = getActivationSnippet(attributes, output.type.value, baseType); + const x = inputVariable('x', inputs[0].dataType, xShape.length); + const w = inputVariable('w', inputs[1].dataType, wShape.length); + const inputVars = [x, w]; + if (hasBias) { + inputVars.push(inputVariable('b', inputs[2].dataType, inputs[2].dims.length)); + } - ${activationFunction} + const uniforms: UniformsArrayType = [ + {name: 'output_size', type: 'u32'}, {name: 'dilations', type: 'u32', length: attributes.dilations.length}, + {name: 'strides', type: 'u32', length: 2}, {name: 'pads', type: 'u32', length: 2}, + {name: 'output_channels_per_group', type: 'u32'} + ]; + appendActivationUniforms(attributes, uniforms); + return ` + ${shaderHelper.registerUniforms(uniforms).declareVariables(...inputVars, output)} ${shaderHelper.mainStart()} - ${shaderHelper.guardAgainstOutOfBoundsWorkgroupSizes(outputSize)} + 
${shaderHelper.guardAgainstOutOfBoundsWorkgroupSizes('uniforms.output_size')} let outputIndices = ${output.offsetToIndices('global_idx')}; let batch: u32 = outputIndices[0]; let output_channel: u32 = outputIndices[${isChannelLast ? 3 : 1}]; let xRCCorner: vec2 = vec2(outputIndices[${isChannelLast ? 1 : 2}], outputIndices[${ - isChannelLast ? 2 : 3}]) * strides - pads; - let group_id: u32 = output_channel / ${outputChannelsPerGroup}u; + isChannelLast ? 2 : 3}]) * uniforms.strides - uniforms.pads; + let group_id: u32 = output_channel / uniforms.output_channels_per_group; var value: ${output.type.value} = ${output.type.value}(0); - for (var wInChannel: u32 = 0u; wInChannel < ${wShape[1]}u; wInChannel++) { - let input_channel = group_id * ${wShape[1]}u + wInChannel; - for (var wHeight: u32 = 0u; wHeight < ${wShape[2]}u; wHeight++) { - let xHeight = xRCCorner.x + wHeight * ${attributes.dilations[0]}u; + for (var wInChannel: u32 = 0u; wInChannel < uniforms.w_shape[1]; wInChannel++) { + let input_channel = group_id * uniforms.w_shape[1] + wInChannel; + for (var wHeight: u32 = 0u; wHeight < uniforms.w_shape[2]; wHeight++) { + let xHeight = xRCCorner.x + wHeight * uniforms.dilations[0]; - if (xHeight < 0u || xHeight >= ${xShape[isChannelLast ? 1 : 2]}u) { + if (xHeight < 0u || xHeight >= uniforms.x_shape[${isChannelLast ? 1 : 2}]) { continue; } - for (var wWidth: u32 = 0u; wWidth < ${wShape[3]}u; wWidth++) { - let xWidth = xRCCorner.y + wWidth * ${attributes.dilations[1]}u; - if (xWidth < 0u || xWidth >= ${xShape[isChannelLast ? 2 : 3]}u) { + for (var wWidth: u32 = 0u; wWidth < uniforms.w_shape[3]; wWidth++) { + let xWidth = xRCCorner.y + wWidth * uniforms.dilations[1]; + if (xWidth < 0u || xWidth >= uniforms.x_shape[${isChannelLast ? 2 : 3}]) { continue; } let xVal = ${ - isChannelLast ? x.get('batch', 'xHeight', 'xWidth', 'input_channel') : - x.get('batch', 'input_channel', 'xHeight', 'xWidth')}; + isChannelLast ? x.get('batch', 'xHeight', 'xWidth', 'input_channel') : + x.get('batch', 'input_channel', 'xHeight', 'xWidth')}; let wVal = ${w.get('output_channel', 'wInChannel', 'wHeight', 'wWidth')}; value += xVal*wVal; } @@ -82,15 +101,115 @@ export const createGroupedConvProgramInfo = ${applyActivation} ${output.setByOffset('global_idx', 'value')} }`; + }; return { name: 'GroupedConv', - shaderCache: {hint: attributes.cacheKey}, + shaderCache: {hint: attributes.cacheKey, inputDependencies}, getRunData: () => ({ outputs: [{ dims: squeezeOutputShapeFunction ? 
squeezeOutputShapeFunction(outputShape) : outputShape, dataType: inputs[0].dataType }], dispatchGroup: {x: Math.ceil(outputSize / 64 /* workgroup size */)}, + programUniforms + }), + getShaderSource, + }; + }; + +export const createGroupedConvVectorizeProgramInfo = + (inputs: readonly TensorView[], attributes: ConvAttributes, outputShape: readonly number[]): ProgramInfo => { + const hasBias = inputs.length > 2; + const components = getMaxComponents(outputShape[3]); + const outputNumber = getMaxComponents(outputShape[2]); + const outputSize = ShapeUtil.size(outputShape) / components / outputNumber; + const xShape = [inputs[0].dims[0], inputs[0].dims[1], inputs[0].dims[2], inputs[0].dims[3] / components]; + const wShape = [inputs[1].dims[0], inputs[1].dims[1], inputs[1].dims[2], inputs[1].dims[3] / components]; + const outputShapeInShader = [outputShape[0], outputShape[1], outputShape[2], outputShape[3] / components]; + + const programUniforms: ProgramUniform[] = [ + {type: DataType.uint32, data: outputSize}, + {type: DataType.int32, data: [attributes.strides[0], attributes.strides[1]]}, + {type: DataType.int32, data: [attributes.pads[0], attributes.pads[1]]} + ]; + appendActivationUniformsData(attributes, programUniforms); + programUniforms.push(...createTensorShapeVariables(xShape, wShape, outputShapeInShader)); + const xNumber = (outputNumber - 1) * attributes.strides[1] + wShape[1]; + const getShaderSource = (shaderHelper: ShaderHelper) => { + const output = outputVariable('output', inputs[0].dataType, outputShapeInShader.length, components); + const baseType = tensorTypeToWsglStorageType(output.type.tensor); + const applyActivation = getActivationSnippet(attributes, output.type.value, baseType); + const x = inputVariable('x', inputs[0].dataType, xShape.length, components); + const w = inputVariable('w', inputs[1].dataType, wShape.length, components); + const inputVars = [x, w]; + if (hasBias) { + inputVars.push(inputVariable('b', inputs[2].dataType, inputs[2].dims, components)); + } + const processBias = hasBias ? 'value += b[output_channel];' : ''; + const uniforms: UniformsArrayType = [ + {name: 'output_size', type: 'u32'}, + {name: 'strides', type: 'i32', length: 2}, + {name: 'pads', type: 'i32', length: 2}, + ]; + appendActivationUniforms(attributes, uniforms); + return ` + ${shaderHelper.registerUniforms(uniforms).declareVariables(...inputVars, output)} + ${shaderHelper.mainStart()} + ${shaderHelper.guardAgainstOutOfBoundsWorkgroupSizes('uniforms.output_size')} + let width0 = uniforms.output_shape[3]; + let output_channel = global_idx % width0; + var index1 = global_idx / width0; + let width1 = uniforms.output_shape[2] / ${outputNumber}u; + let col = (index1 % width1) * ${outputNumber}u; + index1 = index1 / width1; + let row = index1 % uniforms.output_shape[1]; + let batch = index1 / uniforms.output_shape[1]; + + let x_corner = vec2(i32(row), i32(col)) * uniforms.strides - uniforms.pads; + + var x_vals: array<${x.type.value}, ${xNumber}>; + var values: array<${output.type.value}, ${outputNumber}>; + let input_channel = output_channel; + // Use constant instead of uniform can give better performance for w's height/width. 
+ for (var w_height: u32 = 0u; w_height < ${wShape[0]}; w_height++) { + let x_height = x_corner.x + i32(w_height); + if (x_height >= 0 && u32(x_height) < uniforms.x_shape[1]) { + for (var i = 0; i < ${xNumber}; i++) { + let x_width = x_corner.y + i; + if (x_width >= 0 && u32(x_width) < uniforms.x_shape[2]) { + x_vals[i] = ${x.get('batch', 'u32(x_height)', 'u32(x_width)', 'input_channel')}; + } else { + x_vals[i] = ${x.type.value}(0); + } + } + for (var w_width: u32 = 0u; w_width < ${wShape[1]}; w_width++) { + let w_val = ${w.get('w_height', 'w_width', '0', 'output_channel')}; + for (var i = 0u; i < ${outputNumber}u; i++) { + values[i] = fma(x_vals[i * u32(uniforms.strides[1]) + w_width], w_val, values[i]); + } + } + } + } + + for (var i = 0u; i < ${outputNumber}u; i++) { + var value = values[i]; + ${processBias} + ${applyActivation} + ${output.set('batch', 'row', 'col + i', 'output_channel', 'value')}; + } + }`; + }; + + return { + name: 'GroupedConv-Vectorize', + shaderCache: { + hint: `${attributes.cacheKey};${components};${outputNumber};${xNumber};${wShape[0]};${wShape[1]}`, + inputDependencies: hasBias ? ['rank', 'rank', 'type'] : ['rank', 'rank'] + }, + getRunData: () => ({ + outputs: [{dims: outputShape, dataType: inputs[0].dataType}], + dispatchGroup: {x: Math.ceil(outputSize / 64 /* workgroup size */)}, + programUniforms }), getShaderSource, }; diff --git a/js/web/lib/wasm/jsep/webgpu/ops/conv-transpose.ts b/js/web/lib/wasm/jsep/webgpu/ops/conv-transpose.ts index 32b1d52ed94c..41bd1d5326dc 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/conv-transpose.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/conv-transpose.ts @@ -2,7 +2,6 @@ // Licensed under the MIT License. import {TensorView} from '../../tensor-view'; -import {createAttributeWithCacheKey} from '../attribute-with-cache-key'; import {ComputeContext} from '../types'; import {createConv2DTransposeMatMulProgramInfo} from './3rd-party/conv_backprop_mm_webgpu'; @@ -59,7 +58,6 @@ export interface ConvTransposeAttributes extends ConvAttributes { readonly outputShape: readonly number[]; } - const getAdjustedConvTransposeAttributes = (attributes: T, inputs: readonly TensorView[]): T => { const kernelShape = attributes.kernelShape.slice(); @@ -96,11 +94,7 @@ const getAdjustedConvTransposeAttributes = // always return a new object so does not modify the original attributes const newAttributes: T = Object.assign({}, attributes); - const cacheKey = attributes.cacheKey + [ - kernelShape.join('n,'), pads.join(','), strides.join(','), outputPadding.join(','), outputShape.join(','), - dilations.join(',') - ].join('_'); - Object.assign(newAttributes, {kernelShape, pads, outputPadding, outputShape, dilations, strides, cacheKey}); + Object.assign(newAttributes, {kernelShape, pads, outputPadding, outputShape, dilations, strides}); return newAttributes; }; @@ -119,7 +113,7 @@ export const parseConvTransposeAttributes = (attributes: Record const wIsConst = (attributes.wIsConst as () => boolean)(); const outputPadding = attributes.outputPadding as [number, number, number, number]; const outputShape = attributes.outputShape as [number, number]; - return createAttributeWithCacheKey({ + return { autoPad, format, dilations, @@ -130,8 +124,9 @@ export const parseConvTransposeAttributes = (attributes: Record pads, strides, wIsConst, - ...activationAttributes - }); + ...activationAttributes, + cacheKey: `${attributes.format};${activationAttributes.activation};` + }; }; const validateInputs = (inputs: readonly TensorView[], attributes: ConvTransposeAttributes): 
void => { @@ -273,7 +268,7 @@ const convTranspose1d = (context: ComputeContext, attributes: ConvTransposeAttri //[FILTER_OUT_CHANNEL, FILTER_IN_CHANNEL, kW] -> [FILTER_OUT_CHANNEL, FILTER_IN_CHANNEL, kH=1, kW] context.inputs[1].reshape([context.inputs[1].dims[0], context.inputs[1].dims[1], 1, context.inputs[1].dims[2]]) ]; - if (inputs.length === 3) { + if (context.inputs.length === 3) { inputs.push(context.inputs[2]); } let kernelShape = attributes.kernelShape; diff --git a/js/web/lib/wasm/jsep/webgpu/ops/conv.ts b/js/web/lib/wasm/jsep/webgpu/ops/conv.ts index 33a5db7ff6b2..b68d4dcae4cb 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/conv.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/conv.ts @@ -3,12 +3,12 @@ import {TensorView} from '../../tensor-view'; import {PoolConvUtil} from '../../util'; -import {AttributeWithCacheKey, createAttributeWithCacheKey} from '../attribute-with-cache-key'; +import {AttributeWithCacheKey} from '../attribute-with-cache-key'; import {ComputeContext} from '../types'; import {createConv2DMatMulProgramInfo} from './3rd-party/conv2d_mm_webgpu'; import {createMatmulProgramInfo} from './3rd-party/matmul_packed_webgpu'; -import {createGroupedConvProgramInfo} from './conv-grouped'; +import {createGroupedConvProgramInfo, createGroupedConvVectorizeProgramInfo} from './conv-grouped'; import {InternalActivationAttributes, parseInternalActivationAttributes} from './fuse-utils'; import {createNaiveMatmulProgramInfo} from './matmul'; import {createTransposeProgramInfo} from './transpose'; @@ -110,7 +110,7 @@ const getAdjustedConvAttributes = (attributes: T, inpu // always return a new object so does not modify the original attributes const newAttributes: T = Object.assign({}, attributes); - Object.assign(newAttributes, {kernelShape, pads, cacheKey: attributes.cacheKey}); + Object.assign(newAttributes, {kernelShape, pads}); return newAttributes; }; @@ -126,8 +126,18 @@ export const parseConvAttributes = (attributes: Record): ConvAt const strides = attributes.strides as [number, number]; const wIsConst = (attributes.w_is_const as () => boolean)(); - return createAttributeWithCacheKey( - {autoPad, format, dilations, group, kernelShape, pads, strides, wIsConst, ...activationAttributes}); + return { + autoPad, + format, + dilations, + group, + kernelShape, + pads, + strides, + wIsConst, + ...activationAttributes, + cacheKey: `${attributes.format};${activationAttributes.activation};` + }; }; const conv2d = (context: ComputeContext, inputs: readonly TensorView[], attributes: ConvAttributes): void => { @@ -136,12 +146,37 @@ const conv2d = (context: ComputeContext, inputs: readonly TensorView[], attribut // check attributes // const hasPreluActivationWeights = false; /* TODO: add support for prelu activation weights */ + const isChannelsLast = attributes.format === 'NHWC'; if (attributes.group !== 1) { - context.compute(createGroupedConvProgramInfo(inputs, adjustedAttributes)); + // NVIDIA GPU with ampere architecture fails with below 2 cases, but we couldn't repro them with any other + // GPUs. So just disable vectorize on NVIDIA ampere to ensure always correct outputs. 
+ // [webgpu]Conv - conv - vectorize group - B + // [webgpu]Conv - conv - vectorize group - D + const enableGroupedConvVectorize = !context.adapterInfo.isArchitecture('ampere'); + if (enableGroupedConvVectorize && isChannelsLast && inputs[1].dims[0] === attributes.group && + inputs[1].dims[1] === 1 && attributes.dilations[0] === 1 && attributes.dilations[1] === 1) { + const outputShape = calculateOutputShape( + inputs[0].dims, inputs[1].dims, attributes.dilations, adjustedAttributes.pads, attributes.strides, + isChannelsLast); + const transposedWeight = (context.kernelCustomData.wT as TensorView | undefined) ?? + context.compute( + createTransposeProgramInfo(inputs[1], weightTransposeAttribute), + {inputs: [1], outputs: [attributes.wIsConst ? -2 : -1]})[0]; + if (attributes.wIsConst && !context.kernelCustomData.wT) { + context.kernelCustomData.wT = transposedWeight; + } + const convInputs = [inputs[0], transposedWeight]; + if (inputs.length === 3) { + convInputs.push(inputs[2]); + } + context.compute( + createGroupedConvVectorizeProgramInfo(convInputs, adjustedAttributes, outputShape), {inputs: convInputs}); + } else { + context.compute(createGroupedConvProgramInfo(inputs, adjustedAttributes)); + } return; } - const isChannelsLast = attributes.format === 'NHWC'; const hasBias = inputs.length === 3; const inputHeight = inputs[0].dims[isChannelsLast ? 1 : 2]; const inputWidth = inputs[0].dims[isChannelsLast ? 2 : 3]; diff --git a/js/web/lib/wasm/jsep/webgpu/ops/cumsum.ts b/js/web/lib/wasm/jsep/webgpu/ops/cumsum.ts index 2ff909c30e62..6080301d9946 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/cumsum.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/cumsum.ts @@ -54,8 +54,8 @@ const createCumsumProgramInfo = outputs: [{dims: inputShape, dataType: inputType}], dispatchGroup: {x: Math.ceil(outputSize / 64 /* workgroup size */)}, programUniforms: [ - {type: 'uint32', data: outputSize}, {type: 'int32', data: axis}, - ...createTensorShapeVariables(inputShape), ...createTensorShapeVariables(inputShape) + {type: DataType.uint32, data: outputSize}, {type: DataType.int32, data: axis}, + ...createTensorShapeVariables(inputShape, inputShape) ] }), diff --git a/js/web/lib/wasm/jsep/webgpu/ops/depth-to-space.ts b/js/web/lib/wasm/jsep/webgpu/ops/depth-to-space.ts new file mode 100644 index 000000000000..83809b3d5de6 --- /dev/null +++ b/js/web/lib/wasm/jsep/webgpu/ops/depth-to-space.ts @@ -0,0 +1,110 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
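On the `conv2d` change above: the vectorized grouped-conv path is taken only for NHWC depthwise-style convolutions (weight dims `[group, 1, kH, kW]`, both dilations 1), and is force-disabled on NVIDIA Ampere because of the two failing test cases noted in the comment. A hedged sketch of that gating predicate; the `GroupedConvCase` shape is illustrative, not the real signature:

    // Sketch: when createGroupedConvVectorizeProgramInfo may be used.
    interface GroupedConvCase {
      format: 'NHWC'|'NCHW';
      group: number;
      weightDims: readonly number[];         // [outChannels, inChannelsPerGroup, kH, kW]
      dilations: readonly [number, number];
      isAmpere: boolean;                     // adapterInfo.isArchitecture('ampere')
    }

    const canVectorizeGroupedConv = (c: GroupedConvCase): boolean =>
        !c.isAmpere &&                       // known-bad outputs on Ampere, use the naive path
        c.format === 'NHWC' &&
        c.weightDims[0] === c.group &&       // one output channel per group
        c.weightDims[1] === 1 &&             // one input channel per group (depthwise)
        c.dilations[0] === 1 && c.dilations[1] === 1;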
+ +import {DataType} from '../../../wasm-common'; +import {TensorView} from '../../tensor-view'; +import {ShapeUtil} from '../../util'; +import {AttributeWithCacheKey, createAttributeWithCacheKey} from '../attribute-with-cache-key'; +import {ComputeContext, ProgramInfo} from '../types'; + +import {createTensorShapeVariables, IndicesHelper, inputVariable, outputVariable, ShaderHelper} from './common'; + +export interface FormatAttributes { + readonly format: 'NHWC'|'NCHW'; +} + +export interface DepthToSpaceAttributes extends FormatAttributes, AttributeWithCacheKey { + readonly blocksize: number; + readonly mode: string; +} + +const validateInputs = (inputs: readonly TensorView[]): void => { + if (!inputs || inputs.length !== 1) { + throw new Error('DepthToSpace requires 1 input.'); + } + if (inputs[0].dims.length !== 4) { + throw new Error('DepthToSpace requires 4D input.'); + } +}; + +const permFunctionBody = (perm: number[], rank: number, input: IndicesHelper, output: IndicesHelper): string => { + const reverseFunc = []; + reverseFunc.push(`fn perm(i: ${output.type.indices}) -> ${input.type.indices} { + var a: ${input.type.indices};`); + for (let i = 0; i < rank; ++i) { + reverseFunc.push(input.indicesSet('a', perm[i], `i[${i}]`)); + } + reverseFunc.push('return a;}'); + return reverseFunc.join('\n'); +}; + +const createDepthToSpaceProgramInfo = (inputTensor: TensorView, attributes: DepthToSpaceAttributes): ProgramInfo => { + let n: number, h: number, w: number, c: number; + let shape: number[]; + let perm: number[]; + const isChannelLast = attributes.format === 'NHWC'; + const blocksize = attributes.blocksize; + const isDCRmode = attributes.mode === 'DCR'; + if (isChannelLast) { + [n, h, w, c] = inputTensor.dims; + shape = isDCRmode ? [n, h, w, blocksize, blocksize, c / (blocksize ** 2)] : + [n, h, w, c / (blocksize ** 2), blocksize, blocksize]; + perm = isDCRmode ? [0, 1, 3, 2, 4, 5] : [0, 1, 4, 2, 5, 3]; + } else { + [n, h, w, c] = [inputTensor.dims[0], inputTensor.dims[2], inputTensor.dims[3], inputTensor.dims[1]]; + shape = isDCRmode ? [n, blocksize, blocksize, c / (blocksize ** 2), h, w] : + [n, c / (blocksize ** 2), blocksize, blocksize, h, w]; + perm = isDCRmode ? [0, 3, 4, 1, 5, 2] : [0, 1, 4, 2, 5, 3]; + } + const reshapedInputTensor = inputTensor.reshape(shape); + const reshapedInputRank = reshapedInputTensor.dims.length; + const inputDataType = inputTensor.dataType; + + const reshapedInput = inputVariable('a', inputDataType, reshapedInputRank); + const permedOutput = outputVariable('output', inputDataType, reshapedInputRank); + + const getShaderSource = (shaderHelper: ShaderHelper) => ` + ${shaderHelper.registerUniform('output_size', 'u32').declareVariables(reshapedInput, permedOutput)} + + ${permFunctionBody(perm, reshapedInputRank, reshapedInput, permedOutput)} + + ${shaderHelper.mainStart()} + ${shaderHelper.guardAgainstOutOfBoundsWorkgroupSizes('uniforms.output_size')} + + let indices = ${permedOutput.offsetToIndices('global_idx')}; + let aIndices = perm(indices); + + ${permedOutput.setByOffset('global_idx', reshapedInput.getByIndices('aIndices'))} + }`; + + return { + name: 'DepthToSpace', + shaderCache: {hint: `${inputTensor.dims};${attributes.blocksize};${attributes.mode}`, inputDependencies: ['rank']}, + getRunData: (inputs) => { + const outputShape = isChannelLast ? 
[n, h * blocksize, w * blocksize, c / (blocksize ** 2)] : + [n, c / (blocksize ** 2), h * blocksize, w * blocksize]; + const outputSize = ShapeUtil.size(outputShape); + const shapeBeforePerm = reshapedInputTensor.dims; + const shapeAfterPerm = ShapeUtil.sortBasedOnPerm(shapeBeforePerm, perm); + return { + outputs: [{dims: outputShape, dataType: inputs[0].dataType}], + dispatchGroup: {x: Math.ceil(outputSize / 64 /* workgroup size */)}, + programUniforms: + [{type: DataType.uint32, data: outputSize}, ...createTensorShapeVariables(shapeBeforePerm, shapeAfterPerm)], + }; + }, + getShaderSource, + }; +}; + +export const depthToSpace = (context: ComputeContext, attributes: DepthToSpaceAttributes): void => { + validateInputs(context.inputs); + context.compute(createDepthToSpaceProgramInfo(context.inputs[0], attributes)); +}; + +export const parseDepthToSpaceAttributes = (attributes: Record): DepthToSpaceAttributes => + createAttributeWithCacheKey({ + blocksize: attributes.blocksize as number, + mode: attributes.mode as string, + format: attributes.format as 'NHWC' | 'NCHW' + }); diff --git a/js/web/lib/wasm/jsep/webgpu/ops/einsum.ts b/js/web/lib/wasm/jsep/webgpu/ops/einsum.ts index 4db7c04ad67b..19a009c2eb79 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/einsum.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/einsum.ts @@ -1,13 +1,13 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. +import {DataType} from '../../../wasm-common'; import {TensorView} from '../../tensor-view'; import {ShapeUtil} from '../../util'; import {AttributeWithCacheKey, createAttributeWithCacheKey} from '../attribute-with-cache-key'; import {ComputeContext, ProgramInfo, ProgramUniform} from '../types'; -import {createTensorShapeVariables, enableShapesUniforms, inputVariable, outputVariable, ShaderHelper} from './common'; - +import {createTensorShapeVariables, inputVariable, outputVariable, ShaderHelper} from './common'; export interface EinsumAttributes extends AttributeWithCacheKey { readonly equation: string; @@ -181,14 +181,12 @@ class EinsumEquation { const appendMax = (name: string): string => name + '_max'; const createEinsumProgramInfo = - (enableInputShapesUniforms: readonly boolean[], inputShapes: Array, dataType: number, - einsumEquation: EinsumEquation, outputShape: readonly number[]): ProgramInfo => { - const shapeOrRanks = inputShapes.map((dims, index) => enableInputShapesUniforms[index] ? dims.length : dims); - const inputVars = shapeOrRanks.map((shapeOrRank, index) => inputVariable(`input${index}`, dataType, shapeOrRank)); + (inputShapes: Array, dataType: number, einsumEquation: EinsumEquation, + outputShape: readonly number[]): ProgramInfo => { + const ranks = inputShapes.map((dims) => dims.length); + const inputVars = ranks.map((rank, index) => inputVariable(`input${index}`, dataType, rank)); const outputSize = ShapeUtil.size(outputShape); - const enableOutputShapesUniforms = enableShapesUniforms(outputShape.length); - const outputShapeOrRank = enableOutputShapesUniforms ? 
outputShape.length : outputShape; - const output = outputVariable('output', dataType, outputShapeOrRank); + const output = outputVariable('output', dataType, outputShape.length); const uniformsSymbols = [...einsumEquation.symbolToInfo.keys()].filter((symbol) => !einsumEquation.rhs.symbolToIndices.has(symbol)); const getShaderSource = (shaderHelper: ShaderHelper) => { @@ -269,24 +267,20 @@ const createEinsumProgramInfo = }; return { name: 'Einsum', - shaderCache: { - hint: einsumEquation.equation, - inputDependencies: enableInputShapesUniforms.map((enableShapeUniform) => enableShapeUniform ? 'rank' : 'dims') - }, + shaderCache: {hint: einsumEquation.equation, inputDependencies: inputShapes.map(() => 'rank')}, getRunData: () => { // The symbols from uniformSymbols array are guaranteed to exist in einsumEquations.symbolToInfo map. The // filter is added to make sure that dimValue is never 0. const programUniformsInit: ProgramUniform[] = uniformsSymbols.filter((symbol) => einsumEquation.symbolToInfo.has(symbol)) - .map((symbol) => ({type: 'uint32', data: einsumEquation.symbolToInfo.get(symbol)?.dimValue || 0})); - programUniformsInit.push({type: 'uint32', data: outputSize}); + .map( + (symbol) => + ({type: DataType.uint32, data: einsumEquation.symbolToInfo.get(symbol)?.dimValue || 0})); + programUniformsInit.push({type: DataType.uint32, data: outputSize}); const programUniforms: ProgramUniform[] = - inputShapes.filter((_, index) => enableInputShapesUniforms[index]) - .map((dims, _) => [...createTensorShapeVariables(dims)]) + inputShapes.map((dims, _) => [...createTensorShapeVariables(dims)]) .reduce((acc, inputProgramUniforms) => acc.concat(inputProgramUniforms), programUniformsInit); - if (enableOutputShapesUniforms) { - programUniforms.push(...createTensorShapeVariables(outputShape)); - } + programUniforms.push(...createTensorShapeVariables(outputShape)); return ({ outputs: [{dims: outputShape, dataType}], dispatchGroup: {x: Math.ceil(outputSize / 64 /* workgroup size */)}, @@ -299,11 +293,9 @@ const createEinsumProgramInfo = export const einsum = (context: ComputeContext, attributes: EinsumAttributes): void => { const einsumEquation = new EinsumEquation(context.inputs, attributes.equation); - const enableInputShapesUniforms = context.inputs.map((input, _) => enableShapesUniforms(input.dims.length)); const outputShape = einsumEquation.outputDims; const inputShapes = context.inputs.map((input, _) => input.dims); - context.compute(createEinsumProgramInfo( - enableInputShapesUniforms, inputShapes, context.inputs[0].dataType, einsumEquation, outputShape)); + context.compute(createEinsumProgramInfo(inputShapes, context.inputs[0].dataType, einsumEquation, outputShape)); }; export const parseEinsumAttributes = (attributes: Record): EinsumAttributes => { diff --git a/js/web/lib/wasm/jsep/webgpu/ops/expand.ts b/js/web/lib/wasm/jsep/webgpu/ops/expand.ts index 3dc4e957e0fe..80ee906423e1 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/expand.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/expand.ts @@ -6,7 +6,7 @@ import {TensorView} from '../../tensor-view'; import {ShapeUtil} from '../../util'; import {ComputeContext, ProgramInfo, ProgramUniform} from '../types'; -import {createTensorShapeVariables, enableShapesUniforms, inputVariable, outputVariable, ShaderHelper} from './common'; +import {createTensorShapeVariables, inputVariable, outputVariable, ShaderHelper} from './common'; const validateInputs = (inputs: readonly TensorView[]): void => { if (!inputs || inputs.length !== 2) { @@ -47,17 +47,11 @@ const 
createExpandProgramInfo = (inputs: readonly TensorView[]): ProgramInfo => const outputShape: number[] = calculateOutputShape(inputShape, shape); const dataType = inputs[0].dataType; const components = dataType === DataType.bool ? 4 : 1; - const outputSize = ShapeUtil.size(outputShape) / components; - - const enableInputShapeUniform = enableShapesUniforms(inputShape.length); - const enableOutputShapeUniform = enableShapesUniforms(outputShape.length); - + const outputSize = Math.ceil(ShapeUtil.size(outputShape) / components); const getShaderSource = (shaderHelper: ShaderHelper) => { - const inputShapeOrRank = enableInputShapeUniform ? inputShape.length : inputShape; - const outputShapeOrRank = enableOutputShapeUniform ? outputShape.length : outputShape; - const input = inputVariable('input', dataType, inputShapeOrRank, components); - const output = outputVariable('output', dataType, outputShapeOrRank, components); + const input = inputVariable('input', dataType, inputShape.length, components); + const output = outputVariable('output', dataType, outputShape.length, components); let assignment: string; if (dataType === DataType.bool) { const singleAssignment = (resStr: string, x: number, typeCast = '') => ` @@ -90,16 +84,11 @@ const createExpandProgramInfo = (inputs: readonly TensorView[]): ProgramInfo => ${assignment}`; }; - const programUniforms: ProgramUniform[] = [{type: 'uint32', data: outputSize}]; - if (enableInputShapeUniform) { - programUniforms.push(...createTensorShapeVariables(inputShape)); - } - if (enableOutputShapeUniform) { - programUniforms.push(...createTensorShapeVariables(outputShape)); - } + const programUniforms: ProgramUniform[] = + [{type: DataType.uint32, data: outputSize}, ...createTensorShapeVariables(inputShape, outputShape)]; return { name: 'Expand', - shaderCache: {hint: `${outputShape.length}`, inputDependencies: [enableInputShapeUniform ? 'rank' : 'dims']}, + shaderCache: {hint: `${outputShape.length}`, inputDependencies: ['rank']}, getShaderSource, getRunData: () => ({ outputs: [{dims: outputShape, dataType: inputs[0].dataType}], diff --git a/js/web/lib/wasm/jsep/webgpu/ops/fast-gelu.ts b/js/web/lib/wasm/jsep/webgpu/ops/fast-gelu.ts new file mode 100644 index 000000000000..f50a6a3f011f --- /dev/null +++ b/js/web/lib/wasm/jsep/webgpu/ops/fast-gelu.ts @@ -0,0 +1,69 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +import {DataType} from '../../../wasm-common'; +import {TensorView} from '../../tensor-view'; +import {ShapeUtil} from '../../util'; +import {ComputeContext, ProgramInfo} from '../types'; + +import {inputVariable, outputVariable, ShaderHelper, tensorTypeToWsglValueType, UniformsArrayType, WORKGROUP_SIZE} from './common'; +import * as unary from './unary-op'; + +// GELU is defined as Y=0.5*X*(1+tanh(0.797885*X+0.035677*X*X*X)), where X may pre-add a bias. 
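For a concrete check of the formula in the comment above (0.797885 ≈ √(2/π) and 0.035677 ≈ 0.044715·√(2/π)), here is a scalar TypeScript reference of the same tanh approximation; this is a verification sketch, not the shader code itself:

    // Scalar reference for FastGelu: Y = 0.5 * X * (1 + tanh(0.797885*X + 0.035677*X^3)).
    const fastGeluRef = (x: number, bias = 0): number => {
      const v = x + bias;  // X may pre-add a bias
      return 0.5 * v * (1 + Math.tanh(0.797885 * v + 0.035677 * v * v * v));
    };

    // fastGeluRef(1) ≈ 0.8412, close to exact GELU(1) = 0.5 * (1 + erf(1/√2)) ≈ 0.8413.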
+ +const createFastGeluProgramInfo = (inputTensors: readonly TensorView[]): ProgramInfo => { + const dataType = inputTensors[0].dataType; + const outputSize = ShapeUtil.size(inputTensors[0].dims); + const biasLength = ShapeUtil.size(inputTensors[1].dims); + // can only use vec4 when bias length is multiple of 4 + const useVec4 = biasLength % 4 === 0; + const getShaderSource = (shaderHelper: ShaderHelper): string => { + const x = inputVariable('x', dataType, [1], 4); + const bias = inputVariable('bias', dataType, [1], 4); + const y = outputVariable('y', dataType, [1], 4); + + const uniforms: UniformsArrayType = [{name: 'output_vec_size', type: 'u32'}, {name: 'bias_size', type: 'u32'}]; + + const singleElementBias = (i: 0|1|2|3) => ` + let bias${i}_offset: u32 = (global_idx * 4 + ${i}) % uniforms.bias_size; + let bias${i} = ${bias.getByOffset(`bias${i}_offset / 4`)}[bias${i}_offset % 4];`; + const biasGetExpression = useVec4 ? + ` + let bias = ${bias.getByOffset('global_idx % (uniforms.bias_size / 4)')};` : + `${singleElementBias(0)}${singleElementBias(1)}${singleElementBias(2)}${singleElementBias(3)} + let bias = ${x.type.value}(bias0, bias1, bias2, bias3);`; + + return `${shaderHelper.registerUniforms(uniforms).declareVariables(x, bias, y)} + + ${unary.fastGeluImpl(tensorTypeToWsglValueType(dataType))} + + ${shaderHelper.mainStart(WORKGROUP_SIZE)} + ${shaderHelper.guardAgainstOutOfBoundsWorkgroupSizes('uniforms.output_vec_size')} + + let x = ${x.getByOffset('global_idx')}; + ${biasGetExpression} + let x_in = x + bias; + ${y.setByOffset('global_idx', unary.fastGeluExpression('x_in'))} + }`; + }; + + return { + name: 'FastGeluWithBias', + shaderCache: {hint: `${useVec4}`, inputDependencies: ['type', 'type']}, + getShaderSource, + getRunData: (inputs) => ({ + outputs: [{dims: inputs[0].dims, dataType: inputs[0].dataType}], + programUniforms: + [{type: DataType.uint32, data: Math.ceil(outputSize / 4)}, {type: DataType.uint32, data: biasLength}], + dispatchGroup: {x: Math.ceil(outputSize / WORKGROUP_SIZE / 4)} + }) + }; +}; + +export const fastGelu = (context: ComputeContext): void => { + if (context.inputs.length < 2 || ShapeUtil.size(context.inputs[1].dims) === 0) { + unary.fastGelu(context); + } else { + context.compute(createFastGeluProgramInfo(context.inputs)); + } +}; diff --git a/js/web/lib/wasm/jsep/webgpu/ops/fuse-utils.ts b/js/web/lib/wasm/jsep/webgpu/ops/fuse-utils.ts index 0b5c0db2b511..6e66abacf347 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/fuse-utils.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/fuse-utils.ts @@ -1,44 +1,78 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. 
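The `FastGeluWithBias` program above packs four elements per invocation; when the bias length is a multiple of 4 it loads the bias as a single vec4, otherwise each of the four lanes computes its own wrapped offset `(global_idx * 4 + i) % bias_size`. A small sketch of that broadcast rule on plain arrays (illustrative helper, not the shader):

    // Mirrors the per-lane bias fetch from the non-vec4 path of the shader.
    const biasForLane = (bias: readonly number[], globalIdx: number, lane: 0|1|2|3): number =>
        bias[(globalIdx * 4 + lane) % bias.length];

    // With a 6-element bias, invocation 1 (elements 4..7) reads bias 4, 5, 0, 1:
    // [0, 1, 2, 3].map((i) => biasForLane([10, 11, 12, 13, 14, 15], 1, i as 0|1|2|3))
    //   -> [14, 15, 10, 11]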
+import {DataType} from '../../../wasm-common'; import {MAX_CLIP, MIN_CLIP} from '../../util'; +import {ProgramUniform} from '../types'; + +import {UniformsArrayType} from './common'; export interface InternalActivationAttributes { readonly activation: string; readonly clipMin?: number; readonly clipMax?: number; - readonly activationCacheKey: string; + readonly alpha?: number; + readonly beta?: number; } -export const getActivationSnippet = (attributes: InternalActivationAttributes, valueType: string): - {activationFunction: string; applyActivation: string} => { +export const getActivationSnippet = + (attributes: InternalActivationAttributes, valueType: string, baseType = 'f32'): string => { switch (attributes.activation) { case 'Relu': - return {activationFunction: '', applyActivation: `value = max(value, ${valueType}(0.0));`}; + return `value = max(value, ${valueType}(0.0));`; case 'Sigmoid': - return { - activationFunction: '', - applyActivation: `value = (${valueType}(1.0) / (${valueType}(1.0) + exp(-value)));` - }; + return `value = (${valueType}(1.0) / (${valueType}(1.0) + exp(-value)));`; case 'Clip': - return { - activationFunction: `const clip_min_=${valueType}(${attributes.clipMin!});const clip_max_=${valueType}(${ - attributes.clipMax!});`, - applyActivation: 'value = clamp(value, clip_min_, clip_max_);' - }; - // TODO: adding other activations that can be fused. + return `value = clamp(value, ${valueType}(${baseType}(uniforms.clip_min)), ${valueType}(${ + baseType}(uniforms.clip_max)));`; + case 'HardSigmoid': + return `value = max(${valueType}(0.0), min(${valueType}(1.0), ${baseType}(uniforms.alpha) * value + ${ + baseType}(uniforms.beta)));`; + case 'LeakyRelu': + return `value = select(${baseType}(uniforms.alpha) * value, value, value >= ${valueType}(0.0));`; + case '': + return ''; + // TODO: adding other activations that can be fused. 
default: - return {activationFunction: '', applyActivation: ''}; + throw new Error(`Unsupported activation ${attributes.activation}`); + } + }; + +export const appendActivationUniformsData = + (attributes: InternalActivationAttributes, programUniform: ProgramUniform[]) => { + if (attributes.activation === 'Clip') { + programUniform.push( + {type: DataType.float, data: attributes.clipMax!}, {type: DataType.float, data: attributes.clipMin!}); + } else if (attributes.activation === 'HardSigmoid') { + programUniform.push( + {type: DataType.float, data: attributes.alpha!}, {type: DataType.float, data: attributes.beta!}); + } else if (attributes.activation === 'LeakyRelu') { + programUniform.push({type: DataType.float, data: attributes.alpha!}); } }; +export const appendActivationUniforms = (attributes: InternalActivationAttributes, uniforms: UniformsArrayType) => { + if (attributes.activation === 'Clip') { + uniforms.push({name: 'clip_max', type: 'f32'}, {name: 'clip_min', type: 'f32'}); + } else if (attributes.activation === 'HardSigmoid') { + uniforms.push({name: 'alpha', type: 'f32'}, {name: 'beta', type: 'f32'}); + } else if (attributes.activation === 'LeakyRelu') { + uniforms.push({name: 'alpha', type: 'f32'}); + } +}; + export const parseInternalActivationAttributes = (attributes: Record<string, unknown>|undefined): InternalActivationAttributes => { const activation = attributes?.activation as string || ''; - - if (activation === 'Clip') { + if (activation === 'HardSigmoid') { + const [alpha, beta] = attributes?.activation_params as [number, number] || [0.2, 0.5]; + return {activation, alpha, beta}; + } else if (activation === 'Clip') { const [clipMin, clipMax] = attributes?.activation_params as [number, number] || [MIN_CLIP, MAX_CLIP]; - return {activation, clipMax, clipMin, activationCacheKey: `${activation}:${clipMin},${clipMax}`}; + return {activation, clipMax, clipMin}; + } else if (activation === 'LeakyRelu') { + const [alpha] = attributes?.activation_params as [number] || [0.01]; + return {activation, alpha}; } - return {activation, activationCacheKey: activation}; + return {activation}; }; diff --git a/js/web/lib/wasm/jsep/webgpu/ops/gather-elements.ts b/js/web/lib/wasm/jsep/webgpu/ops/gather-elements.ts index a945954adcaa..4ab6c175a67e 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/gather-elements.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/gather-elements.ts @@ -1,6 +1,7 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License.
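Taken together, the fuse-utils helpers above split activation fusion into three pieces: a WGSL snippet that reads its parameters from uniforms, the uniform declarations, and the per-dispatch uniform data. A minimal sketch of how a fused operator is expected to wire them up (attribute values here are illustrative):

```typescript
// Sketch: wiring the three fuse-utils helpers together for LeakyRelu.
const attrs: InternalActivationAttributes = {activation: 'LeakyRelu', alpha: 0.01};

const programUniforms: ProgramUniform[] = [];
appendActivationUniformsData(attrs, programUniforms);  // pushes {type: DataType.float, data: 0.01}

const uniforms: UniformsArrayType = [];
appendActivationUniforms(attrs, uniforms);             // declares {name: 'alpha', type: 'f32'}

// Spliced into the shader after `value` is computed:
const applyActivation = getActivationSnippet(attrs, 'vec4<f32>', 'f32');
// => "value = select(f32(uniforms.alpha) * value, value, value >= vec4<f32>(0.0));"
```

Because the parameters travel as uniforms rather than being baked into the shader source, one compiled shader serves any alpha/beta/clip values.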
+import {DataType} from '../../../wasm-common'; import {TensorView} from '../../tensor-view'; import {ShapeUtil} from '../../util'; import {AttributeWithCacheKey, createAttributeWithCacheKey} from '../attribute-with-cache-key'; @@ -46,11 +47,11 @@ const createGatherElementsProgramInfo = const output = outputVariable('output', inputOutputDataType, outputShape.length); - const programUniforms: ProgramUniform[] = - [{type: 'uint32', data: outputSize}, {type: 'int32', data: axisDimLimit}, {type: 'uint32', data: axis}]; - programUniforms.push(...createTensorShapeVariables(inputShape)); - programUniforms.push(...createTensorShapeVariables(indicesShape)); - programUniforms.push(...createTensorShapeVariables(outputShape)); + const programUniforms: ProgramUniform[] = [ + {type: DataType.uint32, data: outputSize}, {type: DataType.int32, data: axisDimLimit}, + {type: DataType.uint32, data: axis} + ]; + programUniforms.push(...createTensorShapeVariables(inputShape, indicesShape, outputShape)); const inputDependencies: ProgramInputTensorInfoDependency[] = ['rank', 'rank']; // int64 indices would be treated as little endian i32 with assumption they fall in i32 limits diff --git a/js/web/lib/wasm/jsep/webgpu/ops/gather.ts b/js/web/lib/wasm/jsep/webgpu/ops/gather.ts index 53ca094abfd6..d48bb909f7f8 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/gather.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/gather.ts @@ -5,9 +5,9 @@ import {DataType} from '../../../wasm-common'; import {TensorView} from '../../tensor-view'; import {ShapeUtil} from '../../util'; import {AttributeWithCacheKey, createAttributeWithCacheKey} from '../attribute-with-cache-key'; -import {ComputeContext, ProgramInfo, ProgramInputTensorInfoDependency, ProgramUniform} from '../types'; +import {ComputeContext, ProgramInfo, ProgramUniform} from '../types'; -import {createTensorShapeVariables, enableShapesUniforms, inputVariable, outputVariable, ShaderHelper} from './common'; +import {createTensorShapeVariables, inputVariable, outputVariable, ShaderHelper} from './common'; export interface GatherAttributes extends AttributeWithCacheKey { axis: number; @@ -31,35 +31,17 @@ const createGatherProgramInfo = (inputs: readonly TensorView[], attributes: Gath const axisDimLimit = inputShape[axis]; const components = inputs[0].dataType === DataType.bool ? 4 : 1; - const outputSize = ShapeUtil.size(outputShape) / components; + const outputSize = Math.ceil(ShapeUtil.size(outputShape) / components); - const enableInputShapesUniforms = enableShapesUniforms(inputs[0].dims.length); - const inputShapeOrRank = enableInputShapesUniforms ? inputs[0].dims.length : inputs[0].dims; - const enableIndicesShapesUniforms = enableShapesUniforms(inputs[1].dims.length); - const indicesShapeOrRank = enableIndicesShapesUniforms ? inputs[1].dims.length : inputs[1].dims; - const enableOutputShapesUniforms = enableShapesUniforms(outputShape.length); - const outputShapeOrRank = enableOutputShapesUniforms ? 
outputShape.length : outputShape; - - const programUniforms: ProgramUniform[] = - [{type: 'uint32', data: outputSize}, {type: 'int32', data: axisDimLimit}, {type: 'uint32', data: axis}]; - if (enableInputShapesUniforms) { - programUniforms.push(...createTensorShapeVariables(inputs[0].dims)); - } - if (enableIndicesShapesUniforms) { - programUniforms.push(...createTensorShapeVariables(inputs[1].dims)); - } - if (enableOutputShapesUniforms) { - programUniforms.push(...createTensorShapeVariables(outputShape)); - } - - const inputDependencies: ProgramInputTensorInfoDependency[] = []; - inputDependencies.push(enableInputShapesUniforms ? 'rank' : 'dims'); - inputDependencies.push(enableIndicesShapesUniforms ? 'rank' : 'dims'); + const programUniforms: ProgramUniform[] = [ + {type: DataType.uint32, data: outputSize}, {type: DataType.int32, data: axisDimLimit}, + {type: DataType.uint32, data: axis}, ...createTensorShapeVariables(inputs[0].dims, inputs[1].dims, outputShape) + ]; const getShaderSource = (shaderHelper: ShaderHelper) => { - const data = inputVariable('data', inputs[0].dataType, inputShapeOrRank, components); - const indices = inputVariable('inputIndices', inputs[1].dataType, indicesShapeOrRank); - const output = outputVariable('output', inputs[0].dataType, outputShapeOrRank, components); + const data = inputVariable('data', inputs[0].dataType, inputs[0].dims.length, components); + const indices = inputVariable('inputIndices', inputs[1].dataType, inputs[1].dims.length); + const output = outputVariable('output', inputs[0].dataType, outputShape.length, components); const calcDataIndices = (x: number|string): string => { const indicesRank = indicesShape.length; @@ -73,7 +55,7 @@ const createGatherProgramInfo = (inputs: readonly TensorView[], attributes: Gath if (idx${x} < 0) { idx${x} = idx${x} + uniforms.axisDimLimit; } - var dataIndices${x} = ${data.type.indices}(0); + var dataIndices${x} : ${data.type.indices}; `; for (let i = 0, j = 0; i < inputRank; i++) { if (i === axis) { @@ -127,7 +109,7 @@ const createGatherProgramInfo = (inputs: readonly TensorView[], attributes: Gath }; return { name: 'Gather', - shaderCache: {hint: attributes.cacheKey, inputDependencies}, + shaderCache: {hint: attributes.cacheKey, inputDependencies: ['rank', 'rank']}, getRunData: () => ({ outputs: [ {dims: outputShape, dataType: inputs[0].dataType}, diff --git a/js/web/lib/wasm/jsep/webgpu/ops/gemm.ts b/js/web/lib/wasm/jsep/webgpu/ops/gemm.ts index 1c5d28e4b8e3..76302e1af2e5 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/gemm.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/gemm.ts @@ -1,12 +1,13 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. 
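The Gather and GatherElements changes above follow the pattern applied throughout this PR: shapes and strides leave the shader source, arrive via `createTensorShapeVariables(...)` as program uniforms, and `inputDependencies` become `'rank'`. The practical effect is pipeline reuse across shapes of equal rank, sketched conceptually below (the real cache-key composition lives in the WebGPU program manager, so treat this as an assumption-laden illustration):

```typescript
// Conceptual illustration only: with inputDependencies ['rank', 'rank'],
// two Gather dispatches that differ only in concrete dims share a key,
// and therefore a compiled pipeline; dims flow in as uniforms instead.
const cacheKey = (name: string, hint: string, ranks: number[]): string =>
    `${name}|${hint}|${ranks.join(',')}`;

cacheKey('Gather', '0', [3, 1]);  // data [2,3,4], indices [5]
cacheKey('Gather', '0', [3, 1]);  // data [7,8,9], indices [2] -> same key
```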
+import {DataType} from '../../../wasm-common'; import {TensorView} from '../../tensor-view'; import {GemmUtil, ShapeUtil} from '../../util'; -import {AttributeWithCacheKey, createAttributeWithCacheKey} from '../attribute-with-cache-key'; -import {ComputeContext, ProgramInfo} from '../types'; +import {AttributeWithCacheKey} from '../attribute-with-cache-key'; +import {ComputeContext, ProgramInfo, ProgramInputTensorInfoDependency, ProgramUniform} from '../types'; -import {ShaderHelper, tensorTypeToWsglStorageType} from './common'; +import {createTensorShapeVariables, IndicesHelper, inputVariable, outputVariable, ShaderHelper, UniformsArrayType} from './common'; const validateInputs = (inputs: readonly TensorView[]): void => { if (!inputs) { @@ -34,25 +35,6 @@ export interface GemmAttributes extends AttributeWithCacheKey { beta: number; } -const offsetC = (m: number, n: number, dims: readonly number[]): string => { - if (dims.length === 0) { - return '0u'; - } - - const broadcastM = (dims.length === 1 && m !== 1) || (dims.length === 2 && dims[0] !== m); - const broadcastN = dims[dims.length - 1] !== n; - - let offset = '0u'; - if (!broadcastM) { - offset += `+ m * ${dims[dims.length - 1]}u`; - } - if (!broadcastN) { - offset += '+n'; - } - - return offset; -}; - const createGemmProgramInfo = (inputs: readonly TensorView[], attributes: GemmAttributes): ProgramInfo => { const aShape = inputs[0].dims.slice(); const bShape = inputs[1].dims.slice(); @@ -63,68 +45,93 @@ const createGemmProgramInfo = (inputs: readonly TensorView[], attributes: GemmAt throw new Error('Can\'t use gemm on the given tensors'); } const outputSize = ShapeUtil.size(outputShape); - let line = ''; - if (attributes.transA && attributes.transB) { - line = 'value += a[k * M + m] * b[n * K + k];'; - } else if (attributes.transA && !attributes.transB) { - line = 'value += a[k * M + m] * b[k * N + n];'; - } else if (!attributes.transA && attributes.transB) { - line = 'value += a[m * K + k] * b[n * K + k];'; - } else if (!attributes.transA && !attributes.transB) { - line = 'value += a[m * K + k] * b[k * N + n];'; - } - - const dataType = tensorTypeToWsglStorageType(inputs[0].dataType); - const calculateAlpha = attributes.alpha === 1 ? '' : 'value *= alpha;'; - const calculateC = inputs.length === 3 ? 
`value += beta * c[${offsetC(M, N, inputs[2].dims)}];` : ''; - const inputStorageBuffersDeclarations = [ - `@group(0) @binding(0) var a : array<${dataType}>;`, - `@group(0) @binding(1) var b : array<${dataType}>;` + const programUniforms: ProgramUniform[] = [ + {type: DataType.uint32, data: outputSize}, {type: DataType.uint32, data: M}, {type: DataType.uint32, data: N}, + {type: DataType.uint32, data: K}, {type: DataType.float, data: attributes.alpha}, + {type: DataType.float, data: attributes.beta} ]; + const inputDependencies: ProgramInputTensorInfoDependency[] = ['type', 'type']; if (inputs.length === 3) { - inputStorageBuffersDeclarations.push(`@group(0) @binding(2) var c : array<${dataType}>;`); + programUniforms.push(...createTensorShapeVariables(inputs[2].dims)); + inputDependencies.push('rank'); } - const getShaderSource = (shaderHelper: ShaderHelper) => ` - const M: u32 = ${M}u; - const N: u32 = ${N}u; - const K: u32 = ${K}u; - const alpha = ${dataType}(${attributes.alpha}); - const beta = ${dataType}(${attributes.beta}); + programUniforms.push(...createTensorShapeVariables(outputShape)); + + const getShaderSource = (shaderHelper: ShaderHelper) => { + let line = ''; + if (attributes.transA && attributes.transB) { + line = 'value += a[k * uniforms.M + m] * b[n * uniforms.K + k];'; + } else if (attributes.transA && !attributes.transB) { + line = 'value += a[k * uniforms.M + m] * b[k * uniforms.N + n];'; + } else if (!attributes.transA && attributes.transB) { + line = 'value += a[m * uniforms.K + k] * b[n * uniforms.K + k];'; + } else if (!attributes.transA && !attributes.transB) { + line = 'value += a[m * uniforms.K + k] * b[k * uniforms.N + n];'; + } - ${inputStorageBuffersDeclarations.join('\n')} - @group(0) @binding(${inputs.length}) var output : array<${dataType}>; + const calculateAlpha = attributes.alpha === 1 ? 
'' : 'value *= uniforms.alpha;'; + const a = inputVariable('a', inputs[0].dataType, inputs[0].dims); + const b = inputVariable('b', inputs[1].dataType, inputs[1].dims); + const dataType = a.type.value; + let c: IndicesHelper|null = null; + const variables = [a, b]; + if (inputs.length === 3) { + c = inputVariable('c', inputs[2].dataType, inputs[2].dims.length); + variables.push(c); + } + const output = outputVariable('output', inputs[0].dataType, outputShape.length); + variables.push(output); + const uniforms: UniformsArrayType = [ + {name: 'output_size', type: 'u32'}, {name: 'M', type: 'u32'}, {name: 'N', type: 'u32'}, {name: 'K', type: 'u32'}, + {name: 'alpha', type: 'f32'}, {name: 'beta', type: 'f32'} + ]; + return ` + ${shaderHelper.registerUniforms(uniforms).declareVariables(...variables)} ${shaderHelper.mainStart()} - ${shaderHelper.guardAgainstOutOfBoundsWorkgroupSizes(outputSize)} + ${shaderHelper.guardAgainstOutOfBoundsWorkgroupSizes('uniforms.output_size')} - let m = global_idx / N; - let n = global_idx % N; + let m = global_idx / uniforms.N; + let n = global_idx % uniforms.N; var value = ${dataType}(0); - for (var k: u32 = 0u; k<${K}u; k++) { + for (var k: u32 = 0u; k < uniforms.K; k++) { ${line} } ${calculateAlpha} - ${calculateC} + ${(() => { + if (c != null) { + return `let cOffset = ${c.broadcastedIndicesToOffset('vec2(m, n)', output)}; value += ${ + dataType}(uniforms.beta) * ${c.getByOffset('cOffset')};`; + } + return ''; + })()} output[global_idx] = value; - }`; + }; + return { name: 'Gemm', - shaderCache: {hint: attributes.cacheKey}, + shaderCache: {hint: `${attributes.cacheKey}`, inputDependencies}, getRunData: () => ({ outputs: [{dims: outputShape, dataType: inputs[0].dataType}], - dispatchGroup: {x: Math.ceil(outputSize / 64 /* workgroup size */)} + dispatchGroup: {x: Math.ceil(outputSize / 64 /* workgroup size */)}, + programUniforms }), getShaderSource, }; }; +export const parseGemmAttributes = (attributes: Record): GemmAttributes => { + const transA = attributes.transA as boolean; + const transB = attributes.transB as boolean; + const alpha = attributes.alpha as number; + const beta = attributes.beta as number; + return {transA, transB, alpha, beta, cacheKey: `${attributes.transA};${attributes.transB};${attributes.alpha === 1}`}; +}; + export const gemm = (context: ComputeContext, attributes: GemmAttributes): void => { validateInputs(context.inputs); context.compute(createGemmProgramInfo(context.inputs, attributes)); }; - -export const parseGemmAttributes = (attributes: Record): GemmAttributes => - createAttributeWithCacheKey(attributes as Omit); diff --git a/js/web/lib/wasm/jsep/webgpu/ops/instance-norm.ts b/js/web/lib/wasm/jsep/webgpu/ops/instance-norm.ts index 3a84844544c9..c1d762e62aaa 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/instance-norm.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/instance-norm.ts @@ -4,58 +4,56 @@ import {DataType} from '../../../wasm-common'; import {TensorView} from '../../tensor-view'; import {ShapeUtil} from '../../util'; -import {AttributeWithCacheKey, createAttributeWithCacheKey} from '../attribute-with-cache-key'; -import {ComputeContext, ProgramInfo} from '../types'; +import {ComputeContext, ProgramInfo, ProgramInputTensorInfoDependency, ProgramUniform} from '../types'; -import {fillVector, getMaxComponents, inputVariable, outputVariable, ShaderHelper, sumVector, tensorTypeToWsglStorageType} from './common'; +import {createTensorShapeVariables, fillVector, getMaxComponents, inputVariable, outputVariable, ShaderHelper, sumVector, 
tensorTypeToWsglStorageType, UniformsArrayType} from './common'; -export interface InstanceNormAttributes extends AttributeWithCacheKey { +export interface InstanceNormAttributes { epsilon: number; format: 'NHWC'|'NCHW'; } -const metadata = { - name: 'InstanceNormalization' -}; - const createInstanceNormProgramInfo = (inputs: readonly TensorView[], attributes: InstanceNormAttributes): ProgramInfo => { const xShape = inputs[0].dims; - const outputShape = xShape; const axis = 2; const normCount = ShapeUtil.sizeToDimension(xShape, axis); const normSize = ShapeUtil.sizeFromDimension(xShape, axis); const components = getMaxComponents(normSize); const normPackedSize = normSize / components; - const C = xShape[1]; - const x = inputVariable('x', inputs[0].dataType, [xShape[0], xShape[1], normPackedSize], components); - const scale = inputVariable('scale', inputs[1].dataType, inputs[1].dims); - const bias = inputVariable('bias', inputs[2].dataType, inputs[2].dims); - const output = outputVariable('output', inputs[0].dataType, [xShape[0], xShape[1], normPackedSize], components); - const variables = [x, scale, bias, output]; - const dataType = x.type.value; - const f32Type = components === 1 ? 'f32' : `vec${components}`; - const workgroupSize = 64; - const getShaderSource = (shaderHelper: ShaderHelper) => ` - - const C: u32 = ${C}; - const normSize: u32 = ${normSize}; - const epsilon: f32 = ${attributes.epsilon}; + const inputShape = [xShape[0], xShape[1], normPackedSize]; + const inputDependencies: ProgramInputTensorInfoDependency[] = ['rank', 'type', 'type']; + const programUniforms: ProgramUniform[] = + [{type: DataType.uint32, data: normSize}, {type: DataType.uint32, data: normPackedSize}]; + programUniforms.push(...createTensorShapeVariables(inputShape, inputShape)); + + const getShaderSource = (shaderHelper: ShaderHelper) => { + const x = inputVariable('x', inputs[0].dataType, inputShape.length, components); + const scale = inputVariable('scale', inputs[1].dataType, inputs[1].dims); + const bias = inputVariable('bias', inputs[2].dataType, inputs[2].dims); + const output = outputVariable('output', inputs[0].dataType, inputShape.length, components); + const variables = [x, scale, bias, output]; + const dataType = x.type.value; + const f32Type = components === 1 ? 
'f32' : `vec${components}`; + const workgroupSize = 64; + + const uniforms: UniformsArrayType = [{name: 'normSize', type: 'u32'}, {name: 'normPackedSize', type: 'u32'}]; + return ` var meanShared : f32; var squaredNormShared : f32; var workgroupShared : array<${f32Type}, ${workgroupSize}>; const workgroupSize = ${workgroupSize}u; - ${shaderHelper.declareVariables(...variables)} + ${shaderHelper.registerUniforms(uniforms).declareVariables(...variables)} ${shaderHelper.mainStart(workgroupSize)} let norm = global_idx / workgroupSize; - let batch = norm / C; - let channel = norm % C; + let batch = norm / uniforms.x_shape[1]; + let channel = norm % uniforms.x_shape[1]; let localIndex = local_id.x; // initialize workgroup memory var initial = ${f32Type}(0); - for (var h = localIndex; h < ${normPackedSize}; h += workgroupSize) { + for (var h = localIndex; h < uniforms.normPackedSize; h += workgroupSize) { initial = initial + ${f32Type}(${x.get('batch', 'channel', 'h')}); } workgroupShared[localIndex] = initial; @@ -69,13 +67,13 @@ const createInstanceNormProgramInfo = workgroupBarrier(); } if (localIndex == 0) { - meanShared = ${sumVector('workgroupShared[0]', components)} / f32(normSize); + meanShared = ${sumVector('workgroupShared[0]', components)} / f32(uniforms.normSize); } workgroupBarrier(); // reinitialize workgroup memory. initial = ${f32Type}(0); - for (var h = localIndex; h < ${normPackedSize}; h += workgroupSize) { + for (var h = localIndex; h < uniforms.normPackedSize; h += workgroupSize) { let deviation = ${f32Type}(${x.get('batch', 'channel', 'h')}) - ${f32Type}(meanShared); initial = initial + deviation * deviation; } @@ -94,23 +92,26 @@ const createInstanceNormProgramInfo = } workgroupBarrier(); - let invStdDev = 1 / sqrt(squaredNormShared / f32(normSize) + epsilon); + let invStdDev = inverseSqrt(squaredNormShared / f32(uniforms.normSize) + f32(${attributes.epsilon})); let channelScale = invStdDev * f32(${scale.getByOffset('channel')}); let channelShift = f32(${bias.getByOffset('channel')}) - meanShared * channelScale; - for (var h = localIndex; h < ${normPackedSize}; h += workgroupSize) { + for (var h = localIndex; h < uniforms.normPackedSize; h += workgroupSize) { let value = ${x.get('batch', 'channel', 'h')} * ${dataType}(${f32Type}(channelScale)) + ${dataType}(${ - f32Type}(channelShift)); + f32Type}(channelShift)); ${output.set('batch', 'channel', 'h', 'value')}; } }`; + }; return { - ...metadata, - shaderCache: {hint: attributes.cacheKey}, + ...{name: 'InstanceNormalization'}, + // TODO: use epsilon as uniform. Currently epsilon as uniform fails test_instancenorm_epsilon. 
+ shaderCache: {hint: `${attributes.epsilon};${components}`, inputDependencies}, getRunData: () => ({ outputs: [ {dims: outputShape, dataType: inputs[0].dataType}, ], - dispatchGroup: {x: normCount} + dispatchGroup: {x: normCount}, + programUniforms }), getShaderSource, }; @@ -120,10 +121,6 @@ const computeMean = (context: ComputeContext, input: TensorView, scale: TensorView, bias: TensorView, n: number, h: number, c: number, epsilon: number) => { const components = getMaxComponents(c); - const inputHelper = inputVariable('input', input.dataType, input.dims, components); - const scaleHelper = inputVariable('scale', scale.dataType, scale.dims, components); - const biasHelper = inputVariable('bias', bias.dataType, bias.dims, components); - const WG = 64; // we will store channel scale and channel shift in [2, components] matrix // or in vec2 when components == 1 @@ -133,90 +130,107 @@ const computeMean = const unitsOfWork = n * c / components; const wgSize = Math.ceil(h / WG); - const getMeanShaderSource = (shaderHelper: ShaderHelper) => ` - const H: u32 = ${h}; - const C: u32 = ${c / components}; - const imageSize: u32 = ${h * c / components}; + const meanInputDependencies: ProgramInputTensorInfoDependency[] = ['type']; + const meanProgramUniforms: ProgramUniform[] = [ + {type: DataType.uint32, data: wgSize}, {type: DataType.uint32, data: h}, + {type: DataType.uint32, data: Math.floor(c / components)}, + {type: DataType.uint32, data: Math.floor(h * c / components)} + ]; + const getMeanShaderSource = (shaderHelper: ShaderHelper) => { + const inputHelper = inputVariable('input', input.dataType, input.dims, components); + return ` ${shaderHelper.declareVariables(inputHelper)} @group(0) @binding(1) var output : array<${outputType}>; + struct Uniforms {wg_size:u32, H:u32, C:u32, image_size:u32}; + @group(0) @binding(2) var uniforms: Uniforms; ${shaderHelper.mainStart(WG)} - let currentImageNumber = global_idx / ${WG} / C; - let currentChannelNumber = (global_idx / ${WG}) % C; - let wgId = global_idx % ${WG}; - let wgOffset = wgId * ${wgSize}; - if (wgOffset >= H) { + let currentImageNumber = global_idx / ${WG} / uniforms.C; + let currentChannelNumber = (global_idx / ${WG}) % uniforms.C; + let wgOffset = local_id.x * uniforms.wg_size; + if (wgOffset >= uniforms.H) { return; } - let wgMax = min(wgOffset + ${wgSize}, H); + let wgMax = min(wgOffset + uniforms.wg_size, uniforms.H); - let offset = currentImageNumber * imageSize + currentChannelNumber; + let offset = currentImageNumber * uniforms.image_size + currentChannelNumber; var sum = ${fillVector('f32', components)}; var squaredSum = ${fillVector('f32', components)}; for (var i: u32 = wgOffset; i < wgMax; i++) { - let value = ${sumCastType}(input[offset + i * C]); + let value = ${sumCastType}(input[offset + i * uniforms.C]); sum += value; squaredSum += value * value; } output[global_idx] = ${setOutputValue('sum', 'squaredSum')}; }`; + }; const meanValues = context.compute( { name: 'InstanceNormComputeMean', - shaderCache: {hint: JSON.stringify({components, n, h, c})}, + shaderCache: {hint: `${components}`, inputDependencies: meanInputDependencies}, getRunData: () => ({ outputs: [ {dims: [n, c, WG, 2], dataType: DataType.float}, ], dispatchGroup: {x: n * c / components}, + programUniforms: meanProgramUniforms }), getShaderSource: getMeanShaderSource, }, {inputs: [input], outputs: [-1]})[0]; - const getShaderSource = (shaderHelper: ShaderHelper) => ` - const H: u32 = ${h}; - const C: u32 = ${c / components}; - const imageSize: u32 = ${WG * c / 
components}; - const epsilon: f32 = ${epsilon}; + const programUniforms: ProgramUniform[] = [ + {type: DataType.uint32, data: unitsOfWork}, {type: DataType.uint32, data: h}, + {type: DataType.uint32, data: Math.floor(c / components)}, + {type: DataType.uint32, data: Math.floor(WG * c / components)} + ]; + const inputDependencies: ProgramInputTensorInfoDependency[] = ['type', 'type', 'type']; + const getShaderSource = (shaderHelper: ShaderHelper) => { + const scaleHelper = inputVariable('scale', scale.dataType, scale.dims, components); + const biasHelper = inputVariable('bias', bias.dataType, bias.dims, components); + return ` @group(0) @binding(0) var input : array<${outputType}>; @group(0) @binding(1) var scale : array<${scaleHelper.type.storage}>; @group(0) @binding(2) var bias : array<${biasHelper.type.storage}>; @group(0) @binding(3) var output : array<${outputType}>; + struct Uniforms {units_of_work : u32, H: u32, C : u32, image_size : u32}; + @group(0) @binding(4) var uniforms: Uniforms; ${shaderHelper.mainStart()} - ${shaderHelper.guardAgainstOutOfBoundsWorkgroupSizes(unitsOfWork)} - let currentImageNumber = global_idx / C; - let currentChannelNumber = global_idx % C; + ${shaderHelper.guardAgainstOutOfBoundsWorkgroupSizes('uniforms.units_of_work')} + let currentImageNumber = global_idx / uniforms.C; + let currentChannelNumber = global_idx % uniforms.C; - let offset = currentImageNumber * imageSize; + let offset = currentImageNumber * uniforms.image_size; var sum = ${fillVector('f32', components)}; var squaredSum = ${fillVector('f32', components)}; - for (var i: u32 = 0; i < ${WG}; i++) { + for (var i: u32 = 0; i < min(${WG}, uniforms.H); i++) { let value = input[offset + i + currentChannelNumber * ${WG}]; sum += value[0]; squaredSum += value[1]; } - sum = sum / f32(H); - squaredSum = squaredSum / f32(H); - let invStdDev = 1 / sqrt(squaredSum - sum * sum + epsilon); + sum = sum / f32(uniforms.H); + squaredSum = squaredSum / f32(uniforms.H); + let invStdDev = inverseSqrt(squaredSum - sum * sum + f32(${epsilon})); let channelScale = invStdDev * ${sumCastType}(scale[currentChannelNumber]); let channelShift = ${sumCastType}(bias[currentChannelNumber]) - sum * channelScale; output[global_idx] = ${setOutputValue('channelScale', 'channelShift')}; }`; - + }; return context.compute( { name: 'InstanceNormComputeChannelScaleShift', - shaderCache: {hint: JSON.stringify({components, n, h, c, epsilon})}, + // TODO: use epsilon as uniform. Currently epsilon as uniform fails test_instancenorm_epsilon. + shaderCache: {hint: `${components};${epsilon}`, inputDependencies}, getRunData: () => ({ outputs: [ {dims: [n, c, 2], dataType: DataType.float}, ], dispatchGroup: {x: Math.ceil(unitsOfWork / 64 /* workgroup size */)}, + programUniforms }), getShaderSource, }, @@ -230,50 +244,51 @@ const createInstanceNormNHWCProgramInfo = const N = xShape[0]; const C = xShape[xShape.length - 1]; const H = ShapeUtil.sizeFromDimension(xShape, 1) / C; - const components = getMaxComponents(C); const outputSize = ShapeUtil.size(outputShape) / components; - const inputHelper = inputVariable('input', inputs[0].dataType, inputs[0].dims, components); - const outputHelper = outputVariable('output', inputs[0].dataType, outputShape, components); - - const dataType = tensorTypeToWsglStorageType(inputs[0].dataType); - const scaleType = components === 1 ? 'vec2f' : `mat2x${components}f`; - const scaleCastType = components === 1 ? 
dataType : `vec${components}<${dataType}>`; + const programUniforms: ProgramUniform[] = + [{type: DataType.uint32, data: H}, {type: DataType.uint32, data: Math.floor(C / components)}]; + const inputDependencies: ProgramInputTensorInfoDependency[] = ['type', 'type']; // first compute mean const channelScaleShift = computeMean(context, inputs[0], inputs[1], inputs[2], N, H, C, attributes.epsilon); + const getShaderSource = (shaderHelper: ShaderHelper) => { + const dataType = tensorTypeToWsglStorageType(inputs[0].dataType); + const scaleType = components === 1 ? 'vec2f' : `mat2x${components}f`; + const scaleCastType = components === 1 ? dataType : `vec${components}<${dataType}>`; - const getShaderSource = (shaderHelper: ShaderHelper) => ` - const H: u32 = ${H}; - const C: u32 = ${C / components}; + const inputHelper = inputVariable('input', inputs[0].dataType, inputs[0].dims, components); + const outputHelper = outputVariable('output', inputs[0].dataType, outputShape, components); + return ` @group(0) @binding(0) var input : array<${inputHelper.type.storage}>; @group(0) @binding(1) var scaleInput : array<${scaleType}>; @group(0) @binding(2) var output : array<${outputHelper.type.storage}>; + struct Uniforms {H: u32, C : u32}; + @group(0) @binding(3) var uniforms: Uniforms; ${shaderHelper.mainStart()} - let currentImageNumber = global_idx / (C * H); - let currentChannelNumber = global_idx % C; + let currentImageNumber = global_idx / (uniforms.C * uniforms.H); + let currentChannelNumber = global_idx % uniforms.C; - let scaleOffset = currentImageNumber * C + currentChannelNumber; + let scaleOffset = currentImageNumber * uniforms.C + currentChannelNumber; let scale = scaleInput[scaleOffset]; output[global_idx] = fma(input[global_idx], ${scaleCastType}(scale[0]), ${scaleCastType}(scale[1])); }`; + }; context.compute( { - name: 'InstanceNormalization', - shaderCache: {hint: `${attributes.cacheKey}`}, + name: 'InstanceNormalizationNHWC', + shaderCache: {hint: `${components}`, inputDependencies}, getRunData: () => ({ outputs: [{dims: outputShape, dataType: inputs[0].dataType}], - dispatchGroup: {x: Math.ceil(outputSize / 64 /* workgroup size */)} + dispatchGroup: {x: Math.ceil(outputSize / 64 /* workgroup size */)}, + programUniforms }), getShaderSource, }, {inputs: [inputs[0], channelScaleShift]}); }; -export const parseInstanceNormAttributes = (attributes: InstanceNormAttributes): InstanceNormAttributes => - createAttributeWithCacheKey({epsilon: attributes.epsilon, format: attributes.format}); - export const instanceNorm = (context: ComputeContext, attributes: InstanceNormAttributes): void => { if (attributes.format === 'NHWC') { createInstanceNormNHWCProgramInfo(context, context.inputs, attributes); diff --git a/js/web/lib/wasm/jsep/webgpu/ops/layer-norm.ts b/js/web/lib/wasm/jsep/webgpu/ops/layer-norm.ts index 8a9eeecf2c68..b2a1bbe2bea4 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/layer-norm.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/layer-norm.ts @@ -4,12 +4,12 @@ import {DataType} from '../../../wasm-common'; import {TensorView} from '../../tensor-view'; import {ShapeUtil} from '../../util'; -import {AttributeWithCacheKey, createAttributeWithCacheKey} from '../attribute-with-cache-key'; -import {ComputeContext, ProgramInfo} from '../types'; +import {ComputeContext, ProgramInfo, ProgramInputTensorInfoDependency, ProgramUniform} from '../types'; -import {castToF32, fillVector, getMaxComponents, inputVariable, outputVariable, ShaderHelper, sumVector, tensorTypeToWsglStorageType,} from './common'; 
+import {castToF32, fillVector, getMaxComponents, inputVariable, outputVariable, ShaderHelper, sumVector, tensorTypeToWsglStorageType, UniformsArrayType,} from './common'; -export interface LayerNormAttributes extends AttributeWithCacheKey { +interface LayerNormAttributes { + simplified: boolean; axis: number; epsilon: number; } @@ -22,9 +22,11 @@ const validateInputs = (inputs: readonly TensorView[]): void => { const createLayerNormProgramInfo = (inputs: readonly TensorView[], attributes: LayerNormAttributes, outputCount: number): ProgramInfo => { + const simplified = attributes.simplified; + const xShape = inputs[0].dims; const scale = inputs[1]; - const bias = inputs[2]; + const bias = !simplified && inputs[2]; const outputShape = xShape; const axis = ShapeUtil.normalizeAxis(attributes.axis, xShape.length); @@ -39,7 +41,7 @@ const createLayerNormProgramInfo = Got scale size of ${scaleSize} and bias size of ${biasSize}`); } - const meanInvStdDevDim = []; + const meanInvStdDevDim: number[] = []; for (let i = 0; i < xShape.length; ++i) { if (i < axis) { meanInvStdDevDim.push(xShape[i]); @@ -47,60 +49,69 @@ const createLayerNormProgramInfo = meanInvStdDevDim.push(1); } } - const components = getMaxComponents(normSize); - const dataType = tensorTypeToWsglStorageType(inputs[0].dataType); - const variables = [ - inputVariable('x', inputs[0].dataType, inputs[0].dims, components), - inputVariable('scale', scale.dataType, scale.dims, components), + const inputDependencies: ProgramInputTensorInfoDependency[] = ['type', 'type']; + const programUniforms: ProgramUniform[] = [ + {type: DataType.uint32, data: normCount}, {type: DataType.float, data: normSize}, + {type: DataType.uint32, data: Math.floor(normSize / components)}, + {type: DataType.float, data: attributes.epsilon} ]; if (bias) { - variables.push(inputVariable('bias', bias.dataType, bias.dims, components)); + inputDependencies.push('type'); } - variables.push(outputVariable('output', inputs[0].dataType, outputShape, components)); - const hasMeanDataOutput = outputCount > 1; const hasInvStdOutput = outputCount > 2; - if (hasMeanDataOutput) { - variables.push(outputVariable('meanDataOutput', DataType.float, meanInvStdDevDim)); - } - if (hasInvStdOutput) { - variables.push(outputVariable('invStdOutput', DataType.float, meanInvStdDevDim)); - } - - const getShaderSource = (shaderHelper: ShaderHelper) => ` - const normSize: f32 = ${normSize}; - const normSizeVectorized: u32 = ${normSize / components}; - const epsilon: f32 = ${attributes.epsilon}; + const getShaderSource = (shaderHelper: ShaderHelper) => { + const dataType = tensorTypeToWsglStorageType(inputs[0].dataType); + const variables = [ + inputVariable('x', inputs[0].dataType, inputs[0].dims, components), + inputVariable('scale', scale.dataType, scale.dims, components), + ]; + if (bias) { + variables.push(inputVariable('bias', bias.dataType, bias.dims, components)); + } + variables.push(outputVariable('output', inputs[0].dataType, outputShape, components)); + if (hasMeanDataOutput) { + variables.push(outputVariable('mean_data_output', DataType.float, meanInvStdDevDim)); + } + if (hasInvStdOutput) { + variables.push(outputVariable('inv_std_output', DataType.float, meanInvStdDevDim)); + } - ${shaderHelper.declareVariables(...variables)} + const uniforms: UniformsArrayType = [ + {name: 'norm_count', type: 'u32'}, {name: 'norm_size', type: 'f32'}, + {name: 'norm_size_vectorized', type: 'u32'}, {name: 'epsilon', type: 'f32'} + ]; + return ` + 
${shaderHelper.registerUniforms(uniforms).declareVariables(...variables)} ${shaderHelper.mainStart()} - ${shaderHelper.guardAgainstOutOfBoundsWorkgroupSizes(normCount)} - let offset = global_idx * normSizeVectorized; - var meanVector = ${fillVector('f32', components)}; - var meanSquareVector = ${fillVector('f32', components)}; + ${shaderHelper.guardAgainstOutOfBoundsWorkgroupSizes('uniforms.norm_count')} + let offset = global_idx * uniforms.norm_size_vectorized; + var mean_vector = ${fillVector('f32', components)}; + var mean_square_vector = ${fillVector('f32', components)}; - for (var h: u32 = 0u; h < normSizeVectorized; h++) { + for (var h: u32 = 0u; h < uniforms.norm_size_vectorized; h++) { let value = ${castToF32(dataType, components, 'x[h + offset]')}; - meanVector += value; - meanSquareVector += value * value; + mean_vector += value; + mean_square_vector += value * value; } - let mean = ${sumVector('meanVector', components)} / normSize; - let meanSquare = sqrt(${sumVector('meanSquareVector', components)} - / normSize - mean * mean + epsilon); + let mean = ${sumVector('mean_vector', components)} / uniforms.norm_size; + let inv_std_dev = inverseSqrt(${sumVector('mean_square_vector', components)} / uniforms.norm_size ${ + simplified ? '' : '- mean * mean'} + uniforms.epsilon); - for (var j: u32 = 0; j < normSizeVectorized; j++) { + for (var j: u32 = 0; j < uniforms.norm_size_vectorized; j++) { let f32input = ${castToF32(dataType, components, 'x[j + offset]')}; let f32scale = ${castToF32(dataType, components, 'scale[j]')}; - output[j + offset] = ${variables[0].type.value}((f32input - mean) / meanSquare * f32scale + output[j + offset] = ${variables[0].type.value}((f32input ${simplified ? '' : '- mean'}) * inv_std_dev * f32scale ${bias ? `+ ${castToF32(dataType, components, 'bias[j]')}` : ''} ); } - ${hasMeanDataOutput ? 'meanDataOutput[global_idx] = mean' : ''}; - ${hasInvStdOutput ? 'invStdOutput[global_idx] = 1 / meanSquare' : ''}; + ${hasMeanDataOutput ? 'mean_data_output[global_idx] = mean' : ''}; + ${hasInvStdOutput ? 'inv_std_output[global_idx] = inv_std_dev' : ''}; }`; + }; const outputs = [{dims: outputShape, dataType: inputs[0].dataType}]; if (hasMeanDataOutput) { outputs.push({dims: meanInvStdDevDim, dataType: DataType.float}); @@ -111,15 +122,13 @@ const createLayerNormProgramInfo = return { name: 'LayerNormalization', - shaderCache: {hint: `${attributes.cacheKey}|${outputCount}|${inputs.length}`}, - getRunData: () => ({outputs, dispatchGroup: {x: Math.ceil(normCount / 64 /* workgroup size */)}}), + shaderCache: {hint: `${components};${outputCount};${simplified}`, inputDependencies}, + getRunData: () => + ({outputs, dispatchGroup: {x: Math.ceil(normCount / 64 /* workgroup size */)}, programUniforms}), getShaderSource, }; }; -export const parseLayerNormAttributes = (attributes: LayerNormAttributes): LayerNormAttributes => - createAttributeWithCacheKey({axis: attributes.axis, epsilon: attributes.epsilon}); - export const layerNorm = (context: ComputeContext, attributes: LayerNormAttributes): void => { validateInputs(context.inputs); context.compute(createLayerNormProgramInfo(context.inputs, attributes, context.outputCount)); diff --git a/js/web/lib/wasm/jsep/webgpu/ops/matmul.ts b/js/web/lib/wasm/jsep/webgpu/ops/matmul.ts index de9309d1e436..1a92d861002f 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/matmul.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/matmul.ts @@ -1,13 +1,14 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. 
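For the LayerNormalization rewrite above, the shader's math (including the new `simplified` flag, which drops the mean subtraction for RMS-style normalization and makes the bias input optional) reduces to the following scalar reference. An illustrative sketch, not part of the change:

```typescript
// Scalar model of the LayerNormalization shader, one normalized span at a
// time. `simplified` mirrors the shader: no mean subtraction (RMS norm).
const layerNormRef =
    (x: Float32Array, scale: Float32Array, bias: Float32Array|null, epsilon: number,
     simplified: boolean): Float32Array => {
      let sum = 0, sumSq = 0;
      for (const v of x) { sum += v; sumSq += v * v; }
      const mean = simplified ? 0 : sum / x.length;
      // full: 1/sqrt(Var[x] + eps); simplified: 1/sqrt(E[x^2] + eps)
      const invStdDev = 1 / Math.sqrt(sumSq / x.length - mean * mean + epsilon);
      const out = new Float32Array(x.length);
      for (let i = 0; i < x.length; i++) {
        out[i] = (x[i] - mean) * invStdDev * scale[i] + (bias ? bias[i] : 0);
      }
      return out;
    };
```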
+import {DataType} from '../../../wasm-common'; import {TensorView} from '../../tensor-view'; import {BroadcastUtil, ShapeUtil} from '../../util'; import {ComputeContext, ProgramInfo, ProgramUniform} from '../types'; import {createMatmulProgramInfo} from './3rd-party/matmul_packed_webgpu'; -import {createTensorShapeVariables, getBroadcastDims, getMaxComponents, IndicesHelper, inputVariable, internalVariable, outputVariable, ShaderHelper,} from './common'; -import {getActivationSnippet, InternalActivationAttributes} from './fuse-utils'; +import {createTensorShapeVariables, getBroadcastDims, getMaxComponents, IndicesHelper, inputVariable, internalVariable, outputVariable, ShaderHelper, tensorTypeToWsglStorageType, UniformsArrayType} from './common'; +import {appendActivationUniforms, appendActivationUniformsData, getActivationSnippet, InternalActivationAttributes} from './fuse-utils'; export const createNaiveMatmulProgramInfo = (inputs: readonly TensorView[], activationAttributes: InternalActivationAttributes, outputShape: readonly number[], @@ -27,11 +28,13 @@ export const createNaiveMatmulProgramInfo = const outerDims = reshapedOutputShape ? reshapedOutputShape.slice(0, -2) : outputShape.slice(0, -2); const batchSize = ShapeUtil.size(outerDims); const outputShapeInShader = [batchSize, M, N]; + const programUniforms: ProgramUniform[] = [ - {type: 'uint32', data: outputSize}, {type: 'uint32', data: M}, {type: 'uint32', data: N}, - {type: 'uint32', data: K}, ...createTensorShapeVariables(outerDims), ...createTensorShapeVariables(aShape), - ...createTensorShapeVariables(bShape) + {type: DataType.uint32, data: outputSize}, {type: DataType.uint32, data: M}, {type: DataType.uint32, data: N}, + {type: DataType.uint32, data: K} ]; + appendActivationUniformsData(activationAttributes, programUniforms); + programUniforms.push(...createTensorShapeVariables(outerDims, aShape, bShape)); if (hasBias) { programUniforms.push(...createTensorShapeVariables(inputs[2].dims)); } @@ -42,7 +45,8 @@ export const createNaiveMatmulProgramInfo = const a = inputVariable('a', inputs[0].dataType, aShape.length, aComponents); const b = inputVariable('b', inputs[1].dataType, bShape.length, components); const output = outputVariable('output', inputs[0].dataType, outputShapeInShader.length, components); - const {activationFunction, applyActivation} = getActivationSnippet(activationAttributes, output.type.value); + const baseType = tensorTypeToWsglStorageType(output.type.tensor); + const applyActivation = getActivationSnippet(activationAttributes, output.type.value, baseType); const inputVariables = [a, b]; let processBias = ''; if (hasBias) { @@ -57,6 +61,12 @@ export const createNaiveMatmulProgramInfo = const outerDimsB = bShape.slice(0, -2); const broadCastADims = getBroadcastDims(outerDimsA, outerDims); const broadCastBDims = getBroadcastDims(outerDimsB, outerDims); + const uniforms: UniformsArrayType = [ + {name: 'output_size', type: 'u32'}, {name: 'M', type: 'u32'}, {name: 'N', type: 'u32'}, + {name: 'K', type: 'u32'} + ]; + appendActivationUniforms(activationAttributes, uniforms); + const getIndices = (variable: IndicesHelper, broadCastDims: number[]) => { const rank = variable.rank; const name = variable.name; @@ -96,15 +106,10 @@ export const createNaiveMatmulProgramInfo = return ` ${ - shaderHelper.registerUniform('outputSize', 'u32') - .registerUniform('M', 'u32') - .registerUniform('N', 'u32') - .registerUniform('K', 'u32') - .registerInternalVariables(batchDims) - .declareVariables(...inputVariables, output)} - 
${activationFunction} + shaderHelper.registerUniforms(uniforms).registerInternalVariables(batchDims).declareVariables( + ...inputVariables, output)} ${shaderHelper.mainStart()} - ${shaderHelper.guardAgainstOutOfBoundsWorkgroupSizes('uniforms.outputSize')} + ${shaderHelper.guardAgainstOutOfBoundsWorkgroupSizes('uniforms.output_size')} let col = (global_idx % (uniforms.N / ${components})) * ${components}; var index1 = global_idx / (uniforms.N / ${components}); let stride1 = uniforms.M / ${outputNumber}; @@ -134,8 +139,7 @@ export const createNaiveMatmulProgramInfo = return { name: 'MatMulNaive', shaderCache: { - hint: `${activationAttributes.activationCacheKey}_${components}_${aComponents}_${outputNumber}_${ - isChannelsLast}`, + hint: `${activationAttributes.activation};${components};${aComponents};${outputNumber};${isChannelsLast}`, inputDependencies: hasBias ? ['rank', 'rank', 'rank'] : ['rank', 'rank'] }, getRunData: () => ({ @@ -166,9 +170,8 @@ export const matMul = (context: ComputeContext): void => { const N = outputShape[outputShape.length - 1]; const K = context.inputs[0].dims[context.inputs[0].dims.length - 1]; if (N < 8 && K < 8) { - context.compute( - createNaiveMatmulProgramInfo(context.inputs, {activation: '', activationCacheKey: ''}, outputShape)); + context.compute(createNaiveMatmulProgramInfo(context.inputs, {activation: ''}, outputShape)); } else { - context.compute(createMatmulProgramInfo(context.inputs, {activation: '', activationCacheKey: ''}, outputShape)); + context.compute(createMatmulProgramInfo(context.inputs, {activation: ''}, outputShape)); } }; diff --git a/js/web/lib/wasm/jsep/webgpu/ops/matmulnbits.ts b/js/web/lib/wasm/jsep/webgpu/ops/matmulnbits.ts new file mode 100644 index 000000000000..7f1a5b96863f --- /dev/null +++ b/js/web/lib/wasm/jsep/webgpu/ops/matmulnbits.ts @@ -0,0 +1,304 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
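The MatMulNBits kernel that begins here consumes weights quantized to 4 bits: each 32-bit word packs eight nibbles, and element q dequantizes as (q - zero_point) * scale, with one scale (and optionally a packed zero point) per block of `blockSize` elements; the default zero point for unsigned 4-bit data is 8. A scalar sketch of that dequantization, assuming the little-endian nibble order produced by the unpack4xU8 path below:

```typescript
// Illustrative blockwise 4-bit dequantization: one u32 word holds 8 values.
// Matches the lower/upper-nibble interleave of the unpack4xU8 path below.
const dequantizeBlock4Bit =
    (words: Uint32Array, scale: number, zeroPoint = 8): Float32Array => {
      const out = new Float32Array(words.length * 8);
      for (let w = 0; w < words.length; w++) {
        for (let i = 0; i < 8; i++) {
          const q = (words[w] >>> (4 * i)) & 0xF;  // i-th nibble, low to high
          out[w * 8 + i] = (q - zeroPoint) * scale;
        }
      }
      return out;
    };
```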
+ +import {DataType, getTensorElementSize} from '../../../wasm-common'; +import {TensorView} from '../../tensor-view'; +import {ShapeUtil} from '../../util'; +import {AttributeWithCacheKey, createAttributeWithCacheKey} from '../attribute-with-cache-key'; +import {ComputeContext, ProgramInfo, ProgramUniform} from '../types'; + +import {createTensorShapeVariables, getMaxComponents, inputVariable, outputVariable, ShaderHelper, tensorTypeToWsglStorageType, UniformsArrayType} from './common'; + +// TODO support quantization bits not equal to 4 +export interface MatMulNBitsAttributes extends AttributeWithCacheKey { + k: number; + n: number; + accuracyLevel: number; + bits: number; + blockSize: number; +} + +const validateInputs = (inputs: readonly TensorView[], attributes: MatMulNBitsAttributes): void => { + if (inputs.length < 3 || inputs.length > 4) { + throw new Error('MatMulNBits requires 3 or 4 inputs'); + } + const a = inputs[0]; + const aRank = a.dims.length; + if (a.dims[aRank - 1] !== attributes.k) { + throw new Error('The last dim of input shape does not match the k value'); + } + const nBlocksPerCol = Math.floor((attributes.k + attributes.blockSize - 1) / attributes.blockSize); + const blobSize = attributes.blockSize / 8 * attributes.bits; + const b = inputs[1]; + if (!ShapeUtil.areEqual(b.dims, [attributes.n, nBlocksPerCol, blobSize])) { + throw new Error('The second inputs must be 3D tensor with shape N X nBlocksPerCol X blobSize'); + } + const scales = inputs[2]; + const scalesShape = scales.dims; + if (ShapeUtil.size(scalesShape) !== attributes.n * nBlocksPerCol) { + throw new Error('scales input size error.'); + } + if (inputs.length === 4) { + const zeroPoints = inputs[3]; + const zeroPointsShape = zeroPoints.dims; + const expectedZeroPointsSize = + attributes.bits > 4 ? (attributes.n * nBlocksPerCol) : attributes.n * Math.floor((nBlocksPerCol + 1) / 2); + if (ShapeUtil.size(zeroPointsShape) !== expectedZeroPointsSize) { + throw new Error('zeroPoints input size error.'); + } + } +}; + +export const createMatMulNBitsProgramInfo = + (inputs: readonly TensorView[], attributes: MatMulNBitsAttributes, + maxComputeWorkgroupSizes: [number, number, number], maxComputeWorkgroupStorageSize: number): ProgramInfo => { + const inputShape = inputs[0].dims; + const aRank = inputShape.length; + const nBlocksPerCol = Math.floor((attributes.k + attributes.blockSize - 1) / attributes.blockSize); + const dimAOuter = inputShape[aRank - 2]; + const dimInner = attributes.k; + const dimBOuter = attributes.n; + const batchDims = inputShape.slice(0, aRank - 2); + const batchSize = ShapeUtil.size(batchDims); + const blobSize = attributes.blockSize / 8 * attributes.bits; + const blobSizeInWords = blobSize / 4; + const dataType = inputs[0].dataType; + const outputNumber = getMaxComponents(dimAOuter); + const aComponents = getMaxComponents(attributes.k); + const bComponents = getMaxComponents(blobSizeInWords); + const elementSize = getTensorElementSize(dataType)!; + const workgroupOutputSize = dimAOuter * nBlocksPerCol * elementSize; + const maxNumberOfComponents = Math.floor(maxComputeWorkgroupStorageSize / workgroupOutputSize); + const useBlockwiseMatMulNBits = nBlocksPerCol <= maxComputeWorkgroupSizes[0] && maxNumberOfComponents > 0; + const components = (!useBlockwiseMatMulNBits || maxNumberOfComponents >= 4) ? getMaxComponents(dimBOuter) : + ((maxNumberOfComponents >= 2) && getMaxComponents(dimBOuter) >= 2) ? 
2 : + 1; + const outputShape = batchDims.concat([dimAOuter, dimBOuter]); + const outputSize = ShapeUtil.size(outputShape) / components / outputNumber; + + const programUniforms: ProgramUniform[] = useBlockwiseMatMulNBits ? + [] : + [{type: DataType.uint32, data: outputSize}, {type: DataType.uint32, data: attributes.blockSize}]; + const inputShapeTemp = [batchSize, dimAOuter, dimInner / aComponents]; + const bShape = ShapeUtil.convertShape(inputs[1].dims).slice(); + bShape.splice(-1, 1, blobSizeInWords / bComponents); + programUniforms.push(...createTensorShapeVariables(inputShapeTemp)); + programUniforms.push(...createTensorShapeVariables(bShape)); + programUniforms.push(...createTensorShapeVariables(inputs[2].dims)); + if (inputs.length === 4) { + programUniforms.push(...createTensorShapeVariables(ShapeUtil.convertShape(inputs[3].dims))); + } + const outputShapeTemp = [batchSize, dimAOuter, dimBOuter / components]; + programUniforms.push(...createTensorShapeVariables(outputShapeTemp)); + const getShaderSource = (shaderHelper: ShaderHelper) => { + const inputRank = inputShapeTemp.length; + const a = inputVariable('a', inputs[0].dataType, inputRank, aComponents); + const b = inputVariable('b', DataType.uint32, bShape.length, bComponents); + const scales = inputVariable('scales', inputs[2].dataType, inputs[2].dims.length); + const inputVariables = [a, b, scales]; + const zeroPoints = + inputs.length === 4 ? inputVariable('zero_points', DataType.uint32, inputs[3].dims.length) : undefined; + if (zeroPoints) { + inputVariables.push(zeroPoints); + } + const outputRank = outputShapeTemp.length; + const output = outputVariable('output', inputs[0].dataType, outputRank, components); + const uniforms: UniformsArrayType = [{name: 'output_size', type: 'u32'}, {name: 'block_size', type: 'u32'}]; + const dataType = tensorTypeToWsglStorageType(inputs[0].dataType); + + const qDqDataType = (() => { + switch (aComponents) { + case 1: + return `array<${dataType}, 8>`; + case 2: + return `mat4x2<${dataType}>`; + case 4: + return `mat2x4<${dataType}>`; + default: + throw new Error(`${aComponents}-component is not supported.`); + } + })(); + + const processOneBlock = ` + for (var word: u32 = 0; word < ${blobSizeInWords}; word += ${bComponents}) { + ${b.indicesSet('b_indices', '2', 'word')}; + let b_data = ${b.getByIndices('b_indices')}; + for (var i: u32 = 0; i < ${bComponents}; i++) { + let b_value: u32 = ${bComponents === 1 ? 'b_data' : 'b_data[word + i]'}; + let b_mask: u32 = 0x0F0F0F0Fu; + let b_value_lower: vec4 = unpack4xU8(b_value & b_mask); + let b_value_upper: vec4 = unpack4xU8((b_value >> 4) & b_mask); + let b_quantized_values = ${qDqDataType}(${ + Array.from({length: 4}, (_, i) => `${dataType}(b_value_lower[${i}]), ${dataType}(b_value_upper[${i}])`) + .join(', ')}); + let b_dequantized_values = ${(() => { + if (aComponents === 1) { + return `${qDqDataType}(${ + Array.from({length: 8}, (_, i) => `(b_quantized_values[${i}] - zero_point) * scale`).join(', ')});`; + } else { + return `(b_quantized_values - ${qDqDataType}(${Array(8).fill('zero_point').join(',')})) * scale;`; + } + })()}; + // Number of B elements per 32-bit word is 32/bits = 32/4 = 8 + for (var m: u32 = 0; m < ${useBlockwiseMatMulNBits ? dimAOuter : outputNumber}u; m++) { + ${a.indicesSet('a_indices', inputRank - 2, useBlockwiseMatMulNBits ? 
'm' : `row * ${outputNumber} + m`)}; + ${a.indicesSet('a_indices', inputRank - 1, 'word_offset')}; + var input_offset = ${a.indicesToOffset('a_indices')}; + var a_data: ${qDqDataType}; + for (var j: u32 = 0; j < ${8 / aComponents}; j++) { + a_data[j] = ${a.getByOffset('input_offset')}; + input_offset++; + } + ${useBlockwiseMatMulNBits ? 'workgroup_shared[workgroup_shared_offset + m]' : 'output_values[m]'}${ + components > 1 ? '[c]' : ''} += ${ + Array + .from( + {length: 8 / aComponents}, + (_, i) => `${ + aComponents === 1 ? `a_data[${i}] * b_dequantized_values[${i}]` : + `dot(a_data[${i}], b_dequantized_values[${i}])`}`) + .join(' + ')}; + } + word_offset += ${8 / aComponents}; + } + }`; + const updateZeroPointIndex = zeroPoints ? ` + zero_point_offset += 4; + if (zero_point_offset == 32) { + zero_point_offset = 0; + zero_point_index++; + zero_point_word = ${zeroPoints.getByOffset('zero_point_index')}; + }` : + ''; + + return useBlockwiseMatMulNBits ? ` + var workgroup_shared: array<${output.type.value}, ${dimAOuter * nBlocksPerCol}>; + ${shaderHelper.declareVariables(...inputVariables, output)} + ${shaderHelper.mainStart([ + nBlocksPerCol, 1, 1 + ])} + var a_indices: ${a.type.indices}; + var block = local_id.x; + var col = workgroup_id.y; + var batch = workgroup_id.z; + ${a.indicesSet('a_indices', '0', 'batch')}; + // Two zero points are packed into one byte when uniforms.bits is 4. + for (var c: u32 = 0; c < ${components}; c++) { + let col_times_components_plus_c = col * ${components} + c; + ${ + zeroPoints ? ` + var zero_point_bytes_per_col: u32 = (${nBlocksPerCol} + 1) / 2; + var zero_point_byte_count: u32 = col_times_components_plus_c * zero_point_bytes_per_col + (block >> 0x1u); + var zero_point_word_index: u32 = zero_point_byte_count >> 0x2u; + var zero_point_byte_offset: u32 = zero_point_byte_count & 0x3u; + var zero_point_nibble_offset: u32 = block & 0x1u; + var zero_point_bits_offset: u32 = (zero_point_byte_offset << 3) + (zero_point_nibble_offset << 2); + var zero_point_word: u32 = ${zeroPoints.getByOffset('zero_point_word_index')} >> zero_point_bits_offset;` : + ''} + var b_indices: ${b.type.indices}; + ${b.indicesSet('b_indices', '0', 'col_times_components_plus_c')}; + // The scale and zero points are computed per block. + var scales_index = col_times_components_plus_c * ${nBlocksPerCol} + block; + let scale = ${scales.getByOffset('scales_index')}; + // The default zero point is 8 for unsigned 4-bit quantization. + let zero_point = ${dataType}(${zeroPoints ? 
'(zero_point_word) & 0xFu' : 8.0}); + ${b.indicesSet('b_indices', '1', 'block')}; + var word_offset: u32 = block * ${attributes.blockSize / aComponents}; + var workgroup_shared_offset: u32 = block * ${dimAOuter}; + ${processOneBlock} + } + workgroupBarrier(); + if (local_id.x == 0u) { + var output_indices: ${output.type.indices}; + ${output.indicesSet('output_indices', '0', 'batch')}; + ${output.indicesSet('output_indices', outputRank - 1, 'col')}; + ${output.indicesSet('output_indices', outputRank - 2, '0')}; + var output_offset = ${output.indicesToOffset('output_indices')}; + for (var m: u32 = 0u; m < ${dimAOuter}u; m++) { + var output_value: ${output.type.value} = ${output.type.value}(0); + var workgroup_shared_offset: u32 = m; + for (var b: u32 = 0u; b < ${nBlocksPerCol}u; b++) { + output_value += workgroup_shared[workgroup_shared_offset]; + workgroup_shared_offset += ${dimAOuter}; + } + ${output.setByOffset('output_offset', 'output_value')}; + output_offset += ${dimBOuter / components}; + } + } + }` : + ` + ${shaderHelper.registerUniforms(uniforms).declareVariables(...inputVariables, output)} + ${shaderHelper.mainStart()} + ${shaderHelper.guardAgainstOutOfBoundsWorkgroupSizes('uniforms.output_size')} + var output_values: array<${output.type.value}, ${outputNumber}>; + var output_indices = ${output.offsetToIndices('global_idx')}; + var col = ${output.indicesGet('output_indices', outputRank - 1)}; + var row = ${output.indicesGet('output_indices', outputRank - 2)}; + var a_indices: ${a.type.indices} = output_indices; + // Two zero points are packed into one byte because uniforms.bits <= 4. + // zero_point_offset is either 0 or 4. It is bit offset within one byte. + // TODO support zero_point_offset for bits > 4 + ${ + zeroPoints ? ` + var zero_point_abs_offset = col * ${components} * ((${nBlocksPerCol} + 1) / 2); + var zero_point_index: u32 = zero_point_abs_offset / 4; + var zero_point_word: u32 = ${zeroPoints.getByOffset('zero_point_index')}; + var zero_point_offset: u32 = (zero_point_abs_offset % 4) * 8;` : + ''} + var scale_index = col * ${nBlocksPerCol * components}; + var b_indices: ${b.type.indices}; + for (var c: u32 = 0; c < ${components}; c++) { + ${b.indicesSet('b_indices', '0', `col * ${components} + c`)}; + var block_offset: u32 = 0; + for (var block: u32 = 0; block < ${nBlocksPerCol}; block++) { + // The scale and zero points are computed per block. + let scale = ${scales.getByOffset('scale_index')}; + // The default zero point is 8 for unsigned 4-bit quantization. + let zero_point = ${dataType}(${zeroPoints ? 'extractBits(zero_point_word, zero_point_offset, 4)' : 8.0}); + ${b.indicesSet('b_indices', '1', 'block')}; + var word_offset: u32 = block_offset; + ${processOneBlock} + scale_index++; + ${updateZeroPointIndex} + block_offset += uniforms.block_size / ${aComponents}; + } + // Drop the trailing 4 bits if the zero_poit_offset is not a byte boundary to align with the next byte. + ${ + zeroPoints ? `if (zero_point_offset % 8 > 0) { + ${updateZeroPointIndex} + }` : + ''} + } + for (var k: u32 = 0u; k < ${outputNumber}u; k++) { + ${output.indicesSet('output_indices', outputRank - 2, `${outputNumber} * row + k`)}; + ${output.setByIndices('output_indices', 'output_values[k]')} + } + }`; + }; + return { + name: useBlockwiseMatMulNBits ? 
'BlockwiseMatMulNBits' : 'MatMulNBits', + shaderCache: { + hint: `${attributes.cacheKey};${dimAOuter};${dataType};${inputs.length}`, + inputDependencies: Array(inputs.length).fill('rank') + }, + getRunData: () => ({ + outputs: [{dims: outputShape, dataType}], + dispatchGroup: useBlockwiseMatMulNBits ? {x: 1, y: Math.ceil(dimBOuter / components), z: batchSize} : + {x: Math.ceil(outputSize / 64 /* workgroup size */)}, + programUniforms + }), + getShaderSource + }; + }; + +export const matMulNBits = (context: ComputeContext, attributes: MatMulNBitsAttributes): void => { + validateInputs(context.inputs, attributes); + const maxComputeWorkgroupSizes: [number, number, number] = context.getMaxComputeWorkgroupSizes(); + const maxComputeWorkgroupStorageSize = context.getMaxComputeWorkgroupStoragesize(); + context.compute(createMatMulNBitsProgramInfo( + context.inputs, attributes, maxComputeWorkgroupSizes, maxComputeWorkgroupStorageSize)); +}; + +export const parseMatMulNBitsAttributes = (attributes: Record<string, unknown>): MatMulNBitsAttributes => + createAttributeWithCacheKey(attributes as Omit<MatMulNBitsAttributes, keyof AttributeWithCacheKey>); diff --git a/js/web/lib/wasm/jsep/webgpu/ops/multi-head-attentiion.ts b/js/web/lib/wasm/jsep/webgpu/ops/multi-head-attentiion.ts index b7726a36bcaa..5c5c849d9981 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/multi-head-attentiion.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/multi-head-attentiion.ts @@ -1,13 +1,14 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. +import {DataType} from '../../../wasm-common'; import {TensorView} from '../../tensor-view'; import {ShapeUtil} from '../../util'; import {createAttributeWithCacheKey} from '../attribute-with-cache-key'; -import {ComputeContext, GpuDataType} from '../types'; +import {ComputeContext, GpuDataType, ProgramUniform} from '../types'; import {applyAttention, AttentionAttrs, AttentionMaskType, AttentionParameters, AttentionQkvFormat} from './attention'; -import {ShaderHelper, tensorTypeToWsglStorageType} from './common'; +import {inputVariable, outputVariable, ShaderHelper, UniformsArrayType} from './common'; import {createTransposeProgramInfo, TransposeAttributes} from './transpose'; const validateInputs = (inputs: readonly TensorView[], attributes: AttentionAttrs): AttentionParameters => { @@ -228,7 +229,6 @@ const validateInputs = (inputs: readonly TensorView[], attributes: AttentionAttr }; }; - export const parseMultiHeadAttentionAttributes = (attributes: AttentionAttrs): AttentionAttrs => createAttributeWithCacheKey({...attributes}); @@ -239,30 +239,37 @@ const addBiasTranspose = hiddenSize: number, biasOffset: number) => { const outputShape = [batchSize, sequenceLength, hiddenSize]; const outputSize = ShapeUtil.size(outputShape); - - const dataType = tensorTypeToWsglStorageType(qkv.dataType); - const getShaderSource = (shaderHelper: ShaderHelper) => ` - const biasOffset = ${biasOffset}u; - const hiddenSize = ${hiddenSize}u; - - @group(0) @binding(0) var<storage, read> qkv: array<${dataType}>; - @group(0) @binding(1) var<storage, read> bias: array<${dataType}>; - @group(0) @binding(2) var<storage, read_write> qkv_with_bias: array<${dataType}>; - + const programUniforms: ProgramUniform[] = [ + {type: DataType.uint32, data: outputSize}, {type: DataType.uint32, data: biasOffset}, + {type: DataType.uint32, data: hiddenSize} + ]; + + const getShaderSource = (shaderHelper: ShaderHelper) => { + const output = outputVariable('qkv_with_bias', qkv.dataType, outputShape); + const qkvInput =
inputVariable('qkv', qkv.dataType, outputShape); + const biasInput = inputVariable('bias', bias.dataType, outputShape); + + const uniforms: UniformsArrayType = [ + {name: 'output_size', type: 'u32'}, {name: 'bias_offset', type: 'u32'}, {name: 'hidden_size', type: 'u32'} + ]; + return ` + ${shaderHelper.registerUniforms(uniforms).declareVariables(qkvInput, biasInput, output)} ${shaderHelper.mainStart()} - ${shaderHelper.guardAgainstOutOfBoundsWorkgroupSizes(outputSize)} - let biasOffsetIdx = (global_idx % hiddenSize) + biasOffset; + ${shaderHelper.guardAgainstOutOfBoundsWorkgroupSizes('uniforms.output_size')} + let bias_offset_idx = (global_idx % uniforms.hidden_size) + uniforms.bias_offset; - qkv_with_bias[global_idx] = qkv[global_idx] + bias[biasOffsetIdx]; + qkv_with_bias[global_idx] = qkv[global_idx] + bias[bias_offset_idx]; }`; + }; return context.compute( { name: 'MultiHeadAttentionAddBias', - shaderCache: {hint: JSON.stringify({batchSize, sequenceLength, hiddenSize, biasOffset})}, + shaderCache: {inputDependencies: ['type', 'type']}, getRunData: () => ({ outputs: [{dims: outputShape, dataType: qkv.dataType, gpuDataType: GpuDataType.default}], dispatchGroup: {x: Math.ceil(outputSize / 64 /* workgroup size */)}, + programUniforms }), getShaderSource, }, diff --git a/js/web/lib/wasm/jsep/webgpu/ops/pad.ts b/js/web/lib/wasm/jsep/webgpu/ops/pad.ts index 18859e253aa0..d649d3d220ae 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/pad.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/pad.ts @@ -4,12 +4,11 @@ import {DataType} from '../../../wasm-common'; import {TensorView} from '../../tensor-view'; import {ShapeUtil} from '../../util'; -import {AttributeWithCacheKey, createAttributeWithCacheKey} from '../attribute-with-cache-key'; -import {ComputeContext, ProgramInfo} from '../types'; +import {ComputeContext, ProgramInfo, ProgramInputTensorInfoDependency, ProgramUniform} from '../types'; -import {IndicesHelper, inputVariable, outputVariable, ShaderHelper} from './common'; +import {createTensorShapeVariables, getElementAt, IndicesHelper, inputVariable, outputVariable, ShaderHelper, UniformDataElementType, UniformsArrayType} from './common'; -export interface PadAttributes extends AttributeWithCacheKey { +interface PadAttributes { // 0-constant, 1-reflect, 2-edge, 3-wrap readonly mode: number; readonly value: number; @@ -20,8 +19,8 @@ const validateInputs = (inputs: readonly TensorView[]): void => { if (!inputs || inputs.length < 1) { throw new Error('Too few inputs'); } - if (inputs[0].dataType !== DataType.float) { - throw new Error('Input type must be float.'); + if (inputs[0].dataType !== DataType.float && inputs[0].dataType !== DataType.float16) { + throw new Error('Input type must be float or float16.'); } if (inputs.length >= 2) { @@ -35,27 +34,23 @@ const validateInputs = (inputs: readonly TensorView[]): void => { } }; -const getPadConstant = - (output: IndicesHelper, inputDims: readonly number[], inputStrides: readonly number[], pads: number[], - dataType: string, constantValue: number): string => { - const inputRank = inputDims.length; - - let block = ''; - for (let i = inputRank - 1; i >= 0; --i) { - block += ` - k = i32(${output.indicesGet('indices', i)}) - ${pads[i]}; +const getPadConstant = (output: IndicesHelper, inputRank: number, padsLength: number): string => { + let block = ''; + for (let i = inputRank - 1; i >= 0; --i) { + block += ` + k = i32(${output.indicesGet('indices', i)}) - ${getElementAt('uniforms.pads', i, padsLength)}; if (k < 0) { break; } - if (k >= ${inputDims[i]}) { + if 
(k >= i32(${getElementAt('uniforms.x_shape', i, inputRank)})) { break; } - offset += k * ${inputStrides[i]}; + offset += k * i32(${getElementAt('uniforms.x_strides', i, inputRank)}); `; - } + } - return ` - value = ${dataType}(${constantValue}); + return ` + value = ${output.type.value}(uniforms.constant_value); for (var i = 0; i < 1; i++) { var offset = 0; var k = 0; @@ -63,143 +58,142 @@ const getPadConstant = value = x[offset]; } `; - }; - -const getPadReflect = - (output: IndicesHelper, inputDims: readonly number[], inputStrides: readonly number[], pads: number[]): string => { - const inputRank = inputDims.length; +}; - let block = ''; - for (let i = inputRank - 1; i >= 0; --i) { - block += ` - k = i32(${output.indicesGet('indices', i)}) - ${pads[i]}; +const getPadReflect = (output: IndicesHelper, inputRank: number, padsLength: number): string => { + let block = ''; + for (let i = inputRank - 1; i >= 0; --i) { + block += ` + k = i32(${output.indicesGet('indices', i)}) - ${getElementAt('uniforms.pads', i, padsLength)}; if (k < 0) { k = -k; } { - let _2n_1 = ${2 * (inputDims[i] - 1)}; + let _2n_1 = 2 * (i32(${getElementAt('uniforms.x_shape', i, inputRank)}) - 1); k = k % _2n_1; - if(k >= ${inputDims[i]}) { + if(k >= i32(${getElementAt('uniforms.x_shape', i, inputRank)})) { k = _2n_1 - k; } } - offset += k * ${inputStrides[i]}; + offset += k * i32(${getElementAt('uniforms.x_strides', i, inputRank)}); `; - } + } - return ` + return ` var offset = 0; var k = 0; ${block} value = x[offset]; `; - }; - -const getPadEdge = - (output: IndicesHelper, inputDims: readonly number[], inputStrides: readonly number[], pads: number[]): string => { - const inputRank = inputDims.length; +}; - let block = ''; - for (let i = inputRank - 1; i >= 0; --i) { - block += ` - k = i32(${output.indicesGet('indices', i)}) - ${pads[i]}; +const getPadEdge = (output: IndicesHelper, inputRank: number, padsLength: number): string => { + let block = ''; + for (let i = inputRank - 1; i >= 0; --i) { + block += ` + k = i32(${output.indicesGet('indices', i)}) - ${getElementAt('uniforms.pads', i, padsLength)}; if (k < 0) { k = 0; } - if (k >= ${inputDims[i]}) { - k = ${inputDims[i] - 1}; + if (k >= i32(${getElementAt('uniforms.x_shape', i, inputRank)})) { + k = i32(${getElementAt('uniforms.x_shape', i, inputRank)}) - 1; } - offset += k * ${inputStrides[i]}; + offset += k * i32(${getElementAt('uniforms.x_strides', i, inputRank)}); `; - } + } - return ` + return ` var offset = 0; var k = 0; ${block} value = x[offset]; `; - }; - -const getPadWrap = - (output: IndicesHelper, inputDims: readonly number[], inputStrides: readonly number[], pads: number[]): string => { - const inputRank = inputDims.length; +}; - let block = ''; - for (let i = inputRank - 1; i >= 0; --i) { - block += ` - k = i32(${output.indicesGet('indices', i)}) - ${pads[i]}; +const getPadWrap = (output: IndicesHelper, inputRank: number, padsLength: number): string => { + let block = ''; + for (let i = inputRank - 1; i >= 0; --i) { + block += ` + k = i32(${output.indicesGet('indices', i)}) - ${getElementAt('uniforms.pads', i, padsLength)}; if (k < 0) { - k += ${inputDims[i]}; + k += i32(${getElementAt('uniforms.x_shape', i, inputRank)}); } - if (k >= ${inputDims[i]}) { - k -= ${inputDims[i]}; + if (k >= i32(${getElementAt('uniforms.x_shape', i, inputRank)})) { + k -= i32(${getElementAt('uniforms.x_shape', i, inputRank)}); } - offset += k * ${inputStrides[i]}; + offset += k * i32(${getElementAt('uniforms.x_strides', i, inputRank)}); `; - } + } - return ` + return ` var
offset = 0; var k = 0; ${block} value = x[offset]; `; - }; - -const getPadSnippet = - (output: IndicesHelper, inputDims: readonly number[], inputStrides: readonly number[], attributes: PadAttributes, - dataType: string): string => { - switch (attributes.mode) { - case 0: - return getPadConstant(output, inputDims, inputStrides, attributes.pads, dataType, attributes.value); - case 1: - return getPadReflect(output, inputDims, inputStrides, attributes.pads); - case 2: - return getPadEdge(output, inputDims, inputStrides, attributes.pads); - case 3: - return getPadWrap(output, inputDims, inputStrides, attributes.pads); - default: - throw new Error('Invalid mode'); - } - }; - -const generatePadCode = - (shaderHelper: ShaderHelper, inputs: readonly TensorView[], attributes: PadAttributes, dataType: string): - string => { - const inputDims = inputs[0].dims; - const outputDims = ShapeUtil.padShape(inputDims.slice(), attributes.pads); - const outputSize = ShapeUtil.size(outputDims); - const inputStrides = ShapeUtil.computeStrides(inputDims); - - const output = outputVariable('output', inputs[0].dataType, outputDims); - const input = inputVariable('x', inputs[0].dataType, inputDims); - - const padSnippet = getPadSnippet(output, inputDims, inputStrides, attributes, dataType); - const padCode = ` - ${shaderHelper.declareVariables(input, output)} - ${shaderHelper.mainStart()} - ${shaderHelper.guardAgainstOutOfBoundsWorkgroupSizes(outputSize)} - - let indices = ${output.offsetToIndices('global_idx')}; - - var value = ${dataType}(0); - ${padSnippet} - output[global_idx] = value; - }`; - return padCode; - }; +}; + +const getPadSnippet = (output: IndicesHelper, inputRank: number, attributes: PadAttributes): string => { + switch (attributes.mode) { + case 0: + return getPadConstant(output, inputRank, attributes.pads.length); + case 1: + return getPadReflect(output, inputRank, attributes.pads.length); + case 2: + return getPadEdge(output, inputRank, attributes.pads.length); + case 3: + return getPadWrap(output, inputRank, attributes.pads.length); + default: + throw new Error('Invalid mode'); + } +}; const createPadProgramInfo = (inputs: readonly TensorView[], attributes: PadAttributes): ProgramInfo => { const outputShape = ShapeUtil.padShape(inputs[0].dims.slice(), attributes.pads); + const inputDims = inputs[0].dims; + const outputSize = ShapeUtil.size(outputShape); + const programUniforms: ProgramUniform[] = + [{type: DataType.uint32, data: outputSize}, {type: DataType.int32, data: attributes.pads}]; + if (attributes.mode === 0) { + programUniforms.push({type: inputs[0].dataType, data: attributes.value}); + } + + programUniforms.push(...createTensorShapeVariables(inputs[0].dims, outputShape)); + const inputDependencies: ProgramInputTensorInfoDependency[] = ['rank']; + + const getShaderSource = (shaderHelper: ShaderHelper) => { + const output = outputVariable('output', inputs[0].dataType, outputShape.length); + const input = inputVariable('x', inputs[0].dataType, inputDims.length); + const dataType = input.type.value; + const padSnippet = getPadSnippet(output, inputDims.length, attributes); + const uniforms: UniformsArrayType = + [{name: 'output_size', type: 'u32'}, {name: 'pads', type: 'i32', length: attributes.pads.length}]; + if (attributes.mode === 0) { + uniforms.push({name: 'constant_value', type: dataType as UniformDataElementType}); + } + + return ` + ${shaderHelper.registerUniforms(uniforms).declareVariables(input, output)} + ${shaderHelper.mainStart()} + 
${shaderHelper.guardAgainstOutOfBoundsWorkgroupSizes('uniforms.output_size')} + + let indices = ${output.offsetToIndices('global_idx')}; + + var value = ${dataType}(0); + ${padSnippet} + output[global_idx] = value; + }`; + }; + return { name: 'Pad', - shaderCache: {hint: attributes.cacheKey}, + shaderCache: {hint: `${attributes.mode}`, inputDependencies}, getRunData: () => ({ outputs: [{dims: outputShape, dataType: inputs[0].dataType}], - dispatchGroup: {x: Math.ceil(ShapeUtil.size(outputShape) / 64 /* workgroup size */)} + dispatchGroup: {x: Math.ceil(ShapeUtil.size(outputShape) / 64 /* workgroup size */)}, + programUniforms }), - getShaderSource: shaderHelper => generatePadCode(shaderHelper, inputs, attributes, 'f32'), + getShaderSource, }; }; @@ -223,7 +217,7 @@ const createPadAttributesFromInputs = (inputs: readonly TensorView[], attributes const pads: number[] = []; updatePads.forEach(v => pads.push(v)); - return createAttributeWithCacheKey({mode: attributes.mode, value, pads}); + return {mode: attributes.mode, value, pads}; } else { return attributes; } @@ -234,10 +228,3 @@ export const pad = (context: ComputeContext, attributes: PadAttributes): void => const updatedAttributes = createPadAttributesFromInputs(context.inputs, attributes); context.compute(createPadProgramInfo(context.inputs, updatedAttributes), {inputs: [0]}); }; - -export const parsePadAttributes = (attributes: Record): PadAttributes => { - const mode = attributes.mode as number; - const value = attributes.value as number; - const pads = attributes.pads as number[]; - return createAttributeWithCacheKey({mode, value, pads}); -}; diff --git a/js/web/lib/wasm/jsep/webgpu/ops/pool.ts b/js/web/lib/wasm/jsep/webgpu/ops/pool.ts index 9e9b361c1af1..5521650e8ded 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/pool.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/pool.ts @@ -3,6 +3,7 @@ import {env} from 'onnxruntime-common'; +import {DataType} from '../../../wasm-common'; import {TensorView} from '../../tensor-view'; import {PoolConvUtil, ShapeUtil} from '../../util'; import {AttributeWithCacheKey} from '../attribute-with-cache-key'; @@ -56,7 +57,8 @@ const getUniformAndPadInfo = generatePoolingCode( - shaderHelper, x, input.dims.length, outputShape.length, adjustedAttributes, op1, op2, -1e5, uniforms, - hasPads, pwStartEndNotZero, phStartEndNotZero), + shaderHelper, x, input.dims.length, outputShape.length, adjustedAttributes, op1, op2, + (input.dataType === DataType.float16) ? 
-65504 : -1e5, uniforms, hasPads, pwStartEndNotZero, + phStartEndNotZero), }; }; diff --git a/js/web/lib/wasm/jsep/webgpu/ops/range.ts b/js/web/lib/wasm/jsep/webgpu/ops/range.ts index 9cf66111bf70..a21f48ef9ded 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/range.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/range.ts @@ -4,9 +4,9 @@ import {env} from 'onnxruntime-common'; import {DataType} from '../../../wasm-common'; -import {ComputeContext, ProgramInfo} from '../types'; +import {ComputeContext, ProgramInfo, ProgramUniform} from '../types'; -import {outputVariable, ShaderHelper} from './common'; +import {createTensorShapeVariables, outputVariable, ShaderHelper, UniformDataElementType, UniformsArrayType} from './common'; const validateInputsContent = (start: number, limit: number, delta: number): void => { const sameStartLimit = start === limit; @@ -22,23 +22,35 @@ const createRangeProgramInfo = (start: number, limit: number, delta: number, dat const numElements = Math.abs(Math.ceil((limit - start) / delta)); const outputShape: number[] = [numElements]; const outputSize = numElements; + const programUniforms: ProgramUniform[] = [ + {type: DataType.uint32, data: outputSize}, {type: dataType, data: start}, {type: dataType, data: delta}, + ...createTensorShapeVariables(outputShape) + ]; - const output = outputVariable('output', dataType, outputShape); - const wgslType = output.type.storage; - - const getShaderSource = (shaderHelper: ShaderHelper) => ` - ${shaderHelper.declareVariables(output)} + const getShaderSource = (shaderHelper: ShaderHelper) => { + const output = outputVariable('output', dataType, outputShape.length); + const wgslType = output.type.value; + const uniforms: UniformsArrayType = [ + {name: 'outputSize', type: 'u32'}, {name: 'start', type: wgslType as UniformDataElementType}, + {name: 'delta', type: wgslType as UniformDataElementType} + ]; + return ` + ${shaderHelper.registerUniforms(uniforms).declareVariables(output)} ${shaderHelper.mainStart()} - ${shaderHelper.guardAgainstOutOfBoundsWorkgroupSizes(outputSize)} - output[global_idx] = ${wgslType}(${start}) + ${wgslType}(global_idx) * ${wgslType}(${delta}); + ${shaderHelper.guardAgainstOutOfBoundsWorkgroupSizes('uniforms.outputSize')} + output[global_idx] = uniforms.start + ${wgslType}(global_idx) * uniforms.delta; }`; + }; + return { name: 'Range', - shaderCache: {hint: [start, limit, delta].map(x => x.toString()).join('_')}, + shaderCache: {hint: `${dataType}`}, getShaderSource, - getRunData: () => ( - {outputs: [{dims: outputShape, dataType}], - dispatchGroup: {x: Math.ceil(outputSize / 64 /* workgroup size */)}}) + getRunData: () => ({ + outputs: [{dims: outputShape, dataType}], + dispatchGroup: {x: Math.ceil(outputSize / 64 /* workgroup size */)}, + programUniforms + }) }; }; diff --git a/js/web/lib/wasm/jsep/webgpu/ops/reduce-shared.ts b/js/web/lib/wasm/jsep/webgpu/ops/reduce-shared.ts index 7c440cbffea7..210b3ee7e2fc 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/reduce-shared.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/reduce-shared.ts @@ -131,7 +131,7 @@ export const createReduceSharedProgramInfo = const workgroupSize = 32; const sharedMemorySnippet = ` - var<workgroup> aBestValues : array<${output.type.storage}, ${workgroupSize}>; + var<workgroup> aBestValues : array<f32, ${workgroupSize}>; `; const getShaderSource = (shaderHelper: ShaderHelper) => ` @@ -145,10 +145,10 @@ export const createReduceSharedProgramInfo = let outputIndex = global_idx / ${workgroupSize}; let offset = outputIndex * uniforms.reduceSize; - var bestValue =
${output.type.storage}(${reduceInitValues[reduceType]}); + var bestValue = f32(${reduceInitValues[reduceType]}); let Length = uniforms.reduceSize; for (var k = local_idx; k < Length; k = k + ${workgroupSize}) { - let candidate = ${output.type.storage}(${input.getByOffset('offset + k')}); + let candidate = f32(${input.getByOffset('offset + k')}); bestValue = ${reduceOps[reduceType]}; } aBestValues[local_idx] = bestValue; @@ -172,8 +172,8 @@ export const createReduceSharedProgramInfo = output.setByOffset( 'outputIndex', `${ - reduceType === 'mean' ? `bestValue / ${output.type.storage}(uniforms.reduceSize)` : - `${reduceOutputValues[reduceType]}`}`)}; + reduceType === 'mean' ? `${output.type.storage}(bestValue / f32(uniforms.reduceSize))` : + `${output.type.storage}(${reduceOutputValues[reduceType]})`}`)}; } }`; @@ -185,7 +185,7 @@ export const createReduceSharedProgramInfo = getRunData: () => ({ outputs: [{dims: outputShape, dataType: outputDataType}], dispatchGroup: {x: outputSize}, - programUniforms: [{type: 'uint32', data: reduceSize}] + programUniforms: [{type: DataType.uint32, data: reduceSize}] }), }; }; diff --git a/js/web/lib/wasm/jsep/webgpu/ops/reduce.ts b/js/web/lib/wasm/jsep/webgpu/ops/reduce.ts index e8851ac54694..e8205ba6fd92 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/reduce.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/reduce.ts @@ -100,10 +100,8 @@ export const createReduceProgramInfo = getRunData: () => ({ outputs: [{dims: outputShape, dataType: outputDataType}], dispatchGroup: {x: Math.ceil(outputSize / 64 /* workgroup size */)}, - programUniforms: [ - {type: 'uint32', data: outputSize}, ...createTensorShapeVariables(inputShape), - ...createTensorShapeVariables(outputShape) - ] + programUniforms: + [{type: DataType.uint32, data: outputSize}, ...createTensorShapeVariables(inputShape, outputShape)] }), }; }; diff --git a/js/web/lib/wasm/jsep/webgpu/ops/resize.ts b/js/web/lib/wasm/jsep/webgpu/ops/resize.ts index bea3e8625b41..2c6b537de1f0 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/resize.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/resize.ts @@ -2,6 +2,7 @@ // Licensed under the MIT License. 
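The reduce-shared.ts change above keeps the shared-memory accumulator in `f32` regardless of the tensor's storage type, casting back to the output type only on the final write. A minimal CPU sketch of the generated reduction, assuming `reduceType === 'mean'` and the fixed workgroup size of 32 (names are illustrative, not code from this diff):

```ts
// CPU sketch of the workgroup-shared 'mean' reduction generated above.
// `data` stands in for the input tensor flattened as [outputSize, reduceSize].
const WORKGROUP_SIZE = 32;

function reduceMeanShared(data: Float32Array, outputIndex: number, reduceSize: number): number {
  const offset = outputIndex * reduceSize;
  // Each lane accumulates a strided slice in f32, mirroring
  // `for (var k = local_idx; k < Length; k = k + 32)` in the WGSL.
  const lanes = new Float32Array(WORKGROUP_SIZE);
  for (let lane = 0; lane < WORKGROUP_SIZE; lane++) {
    let bestValue = 0; // the init value for 'mean' is 0
    for (let k = lane; k < reduceSize; k += WORKGROUP_SIZE) {
      bestValue += data[offset + k];
    }
    lanes[lane] = bestValue;
  }
  // The shader folds lanes pairwise between workgroupBarrier() calls;
  // for a sum, the serial fold below gives the same result.
  let sum = 0;
  for (const v of lanes) {
    sum += v;
  }
  // The division also happens in f32; only the store casts to the
  // output storage type (possibly f16).
  return sum / reduceSize;
}
```

For `f16` tensors this keeps the long-running accumulation in full precision, so only the final value is rounded.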
+import {DataType} from '../../../wasm-common'; import {TensorView} from '../../tensor-view'; import {ShapeUtil} from '../../util'; import {AttributeWithCacheKey, createAttributeWithCacheKey} from '../attribute-with-cache-key'; @@ -70,7 +71,6 @@ const validateInputs = const rank = inputs[0].dims.length; if (roiInputIndex > 0 && inputs.length > roiInputIndex && inputs[roiInputIndex].dims.length > 0) { inputs[roiInputIndex].getFloat32Array().forEach((value) => roi.push(value)); - } else if (attributes.coordinateTransformMode === 'tf_crop_and_resize') { throw new Error('Resize requires RoI input to be specified when coordinateTransformMode is tfCropAndResize'); } @@ -110,41 +110,48 @@ const validateInputs = const getOriginalCoordinateFromResizedCoordinate = (coordinateTransferMode: CoordinateTransformMode, dType: string): string => - `fn getOriginalCoordinateFromResizedCoordinate(xResized: ${dType}, xScale: ${dType}, lengthResized: ${dType}, - lengthOriginal: ${dType}, roiStart: ${dType}, roiEnd: ${dType}) -> ${dType} { ` + + `fn getOriginalCoordinateFromResizedCoordinate(xResized: u32, xScale: f32, lengthResized: u32, + lengthOriginal: u32, roiStart: f32, roiEnd: f32) -> ${dType} { ` + (() => { switch (coordinateTransferMode) { case 'asymmetric': - return 'return xResized / xScale;'; + return `return ${dType}(xResized) / ${dType}(xScale);`; case 'pytorch_half_pixel': - return 'if (lengthResized > 1) { \ - return (xResized + 0.5) / xScale - 0.5; \ - } else { \ - return 0.0; \ - }'; + return `if (lengthResized > 1) { + return (${dType}(xResized) + 0.5) / ${dType}(xScale) - 0.5; + } else { + return 0.0; + }`; case 'tf_half_pixel_for_nn': - return 'return (xResized + 0.5) / xScale;'; + return `return (${dType}(xResized) + 0.5) / ${dType}(xScale);`; case 'align_corners': - return 'if (lengthResized == 1) { \ - return 0.0; \ - } else { \ - return xResized * (lengthOriginal - 1) / (lengthResized - 1); \ - }'; + return `if (lengthResized == 1) { + return 0.0; + } else { + // The whole part and the fractional part are calculated separately due to inaccuracy of floating + // point division. As an example, f32(21) / f32(7) may evaluate to 2.99... instead of 3, causing an + // offset-by-one error later in floor(). 
+ let whole = ${dType}(xResized * (lengthOriginal - 1) / (lengthResized - 1)); + let fract = + ${dType}(xResized * (lengthOriginal - 1) % (lengthResized - 1)) / ${dType}(lengthResized - 1); + return whole + fract; + }`; case 'tf_crop_and_resize': - return `if (lengthResized > 1) { \ - return roiStart * (lengthOriginal - 1) + \ - (xResized * (roiEnd - roiStart) * (lengthOriginal - 1)) / (lengthResized - 1); \ - } else { \ - return 0.5 * (roiStart + roiEnd) * ${dType}(lengthOriginal - 1); \ + return `if (lengthResized > 1) { + return ${dType}(roiStart) * ${dType}(lengthOriginal - 1) + + (${dType}(xResized) * ${dType}(roiEnd - roiStart) * ${dType}(lengthOriginal - 1)) / + ${dType}(lengthResized - 1); + } else { + return 0.5 * ${dType}(roiStart + roiEnd) * ${dType}(lengthOriginal - 1); }`; case 'half_pixel_symmetric': - return [ - 'const outputWidth = xScale * lengthResized;', 'const adjustment = lengthResized / outputWidth;', - 'const center = lengthOriginal / 2;', 'const offset = center * (1 - adjustment);', - 'return offset + ((xResized + 0.5) / xScale) - 0.5;' - ].join('\n'); + return `const outputWidth = ${dType}(xScale) * ${dType}(lengthResized); + const adjustment = ${dType}(lengthResized) / outputWidth; + const center = ${dType}(lengthOriginal) / 2; + const offset = center * (1 - adjustment); + return offset + ((${dType}(xResized) + 0.5) / ${dType}(xScale)) - 0.5;`; case 'half_pixel': - return 'return ((xResized + 0.5) / xScale) - 0.5;'; + return `return ((${dType}(xResized) + 0.5) / ${dType}(xScale)) - 0.5;`; default: throw new Error(`Coordinate transform mode ${coordinateTransferMode} is not supported`); } @@ -254,15 +261,15 @@ const calculateOriginalIndicesFromOutputIndices = output.type.value}, ${outputShape.length}> { var original_indices: array<${output.type.value}, ${outputShape.length}>; for (var i:u32 = 0; i < ${outputShape.length}; i++) { - var output_index = ${output.type.value}(${output.indicesGet('output_indices', 'i')}); + var output_index = ${output.indicesGet('output_indices', 'i')}; var scale = ${getElementAt('uniforms.scales', 'i', scalesLength)}; var roi_low = ${getElementAt('uniforms.roi', 'i', roiLength)}; var roi_hi = ${getElementAt('uniforms.roi', `i + ${inputShape.length}`, roiLength)}; if (scale == 1.0) { - original_indices[i] = output_index; + original_indices[i] = ${output.type.value}(output_index); } else { - var input_shape_i = ${output.type.value}(${getElementAt('uniforms.input_shape', 'i', inputShape.length)}); - var output_shape_i = ${output.type.value}(${getElementAt('uniforms.output_shape', 'i', outputShape.length)}); + var input_shape_i = ${getElementAt('uniforms.input_shape', 'i', inputShape.length)}; + var output_shape_i = ${getElementAt('uniforms.output_shape', 'i', outputShape.length)}; original_indices[i] = getOriginalCoordinateFromResizedCoordinate(output_index, scale, output_shape_i, input_shape_i, roi_low, roi_hi); } @@ -276,23 +283,23 @@ const calculateInputIndicesFromOutputIndices = fn calculateInputIndicesFromOutputIndices(output_indices: ${output.type.indices}) -> ${input.type.indices} { var input_indices: ${input.type.indices}; for (var i:u32 = 0; i < ${outputShape.length}; i++) { - var output_index = ${output.type.value}(${output.indicesGet('output_indices', 'i')}); + var output_index = ${output.indicesGet('output_indices', 'i')}; var input_index: u32; var scale = ${getElementAt('uniforms.scales', 'i', scalesLength)}; if (scale == 1.0) { - input_index = u32(output_index); + input_index = output_index; } else { var roi_low =
${getElementAt('uniforms.roi', 'i', roiLength)}; var roi_hi = ${getElementAt('uniforms.roi', `i + ${inputShape.length}`, roiLength)}; - var input_shape_i = ${output.type.value}(${getElementAt('uniforms.input_shape', 'i', inputShape.length)}); - var output_shape_i = ${output.type.value}(${getElementAt('uniforms.output_shape', 'i', outputShape.length)}); + var input_shape_i = ${getElementAt('uniforms.input_shape', 'i', inputShape.length)}; + var output_shape_i = ${getElementAt('uniforms.output_shape', 'i', outputShape.length)}; var original_idx = getOriginalCoordinateFromResizedCoordinate(output_index, scale, output_shape_i, input_shape_i, roi_low, roi_hi); - if (!${useExtrapolation} || (original_idx >= 0 && original_idx < input_shape_i)) { + if (!${useExtrapolation} || (original_idx >= 0 && original_idx < ${output.type.value}(input_shape_i))) { if (original_idx < 0) { input_index = 0; - } else if (original_idx > (input_shape_i - 1)) { - input_index = u32(input_shape_i) - 1; + } else if (original_idx > ${output.type.value}(input_shape_i - 1)) { + input_index = input_shape_i - 1; } else { input_index = u32(getNearestPixelFromOriginal(original_idx, scale < 1)); } @@ -391,8 +398,8 @@ const bicubicInterpolation = fn ${direction}CubicInterpolation(input_indices: ${input.type.indices}, output_indices: ${ output.type.indices}) -> ${dType} { var output_index = ${output.indicesGet('output_indices', idx)}; - var originalIdx: ${dType} = getOriginalCoordinateFromResizedCoordinate(${dType}(output_index), ${scales[idx]}, - ${dType}(${outputShape[idx]}), ${dType}(${inputShape[idx]}), ${roi[idx]}, ${roi[idx]} + ${inputShape.length}); + var originalIdx: ${dType} = getOriginalCoordinateFromResizedCoordinate(output_index, ${scales[idx]}, + ${outputShape[idx]}, ${inputShape[idx]}, ${roi[idx]}, ${roi[idx]} + ${inputShape.length}); var fractOriginalIdx: ${dType} = originalIdx - floor(originalIdx); var coefs = getCubicInterpolationCoefs(fractOriginalIdx); @@ -635,11 +642,8 @@ const createResizeProgramInfo = outputs: [{dims: outputShape, dataType: inputTensor.dataType}], dispatchGroup: {x: Math.ceil(outputSize / 64 /* workgroup size */)}, programUniforms: [ - {type: 'uint32', data: outputSize}, - {type: 'float32', data: scales}, - {type: 'float32', data: roi}, - ...createTensorShapeVariables(inputShape), - ...createTensorShapeVariables(outputShape), + {type: DataType.uint32, data: outputSize}, {type: DataType.float, data: scales}, + {type: DataType.float, data: roi}, ...createTensorShapeVariables(inputShape, outputShape) ] }) }; @@ -656,6 +660,10 @@ export const resize = (context: ComputeContext, attributes: ResizeAttributes): v const scales: number[] = []; const sizes: number[] = []; const roi: number[] = []; + + // Note that scales in resize are always f32. roi can be f32 or f16. + // TODO: Currently this code does not support f16 for roi when passed as optional input. + const opsetVersion = getOpsetVersionFromCustomDataBuffer(context); if (attributes.antialias !== 0) { throw Error('Only default value (0) for Antialias attribute is supported'); diff --git a/js/web/lib/wasm/jsep/webgpu/ops/rotary-embedding.ts b/js/web/lib/wasm/jsep/webgpu/ops/rotary-embedding.ts new file mode 100644 index 000000000000..a58087072e4c --- /dev/null +++ b/js/web/lib/wasm/jsep/webgpu/ops/rotary-embedding.ts @@ -0,0 +1,170 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
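The `align_corners` rewrite in resize.ts above splits the coordinate into a whole part computed with exact integer arithmetic and a separately divided fractional part, because a single f32 division (e.g. `f32(21) / f32(7)` evaluating to 2.99...) can land just below an integer and throw off a later `floor()`. A small TypeScript illustration of the decomposition (a sketch only, not code from this diff):

```ts
// Sketch of the align_corners coordinate split shown above.
function alignCornersCoord(xResized: number, lengthResized: number, lengthOriginal: number): number {
  if (lengthResized === 1) {
    return 0;
  }
  const num = xResized * (lengthOriginal - 1);
  const den = lengthResized - 1;
  const whole = Math.trunc(num / den);  // exact whole part, like u32 division in WGSL
  const fract = (num % den) / den;      // fractional remainder, always < 1
  return whole + fract;
}

// e.g. xResized = 2, lengthResized = 4, lengthOriginal = 11:
// whole = trunc(20 / 3) = 6, fract = (20 % 3) / 3 = 2/3, coordinate = 6.666...
console.log(alignCornersCoord(2, 4, 11));
```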
+ +import {DataType} from '../../../wasm-common'; +import {TensorView} from '../../tensor-view'; +import {ShapeUtil} from '../../util'; +import {createAttributeWithCacheKey} from '../attribute-with-cache-key'; +import {ComputeContext, ProgramInfo, ProgramUniform} from '../types'; + +import {createTensorShapeVariables, inputVariable, outputVariable, ShaderHelper, WORKGROUP_SIZE} from './common'; + +export interface RotaryEmbeddingAttributes { + readonly interleaved: boolean; + readonly numHeads: number; + readonly rotaryEmbeddingDim: number; + readonly scale: number; +} + +const validateInputs = (inputs: readonly TensorView[], attributes: RotaryEmbeddingAttributes): void => { + const [input, positionIds, cosCache, sinCache] = inputs; + const {numHeads, rotaryEmbeddingDim} = attributes; + + if (input.dims.length !== 3 && input.dims.length !== 4) { + throw new Error(`Input 'x' is expected to have 3 or 4 dimensions, got ${input.dims.length}`); + } + if (!ShapeUtil.areEqual(positionIds.dims, []) && !ShapeUtil.areEqual(positionIds.dims, [1]) && + positionIds.dims.length !== 2) { + throw new Error(`Input 'position_ids' is expected to have 0, 1, or 2 dimensions, got ${positionIds.dims.length}`); + } + if (cosCache.dims.length !== 2) { + throw new Error(`Input 'cos_cache' is expected to have 2 dimensions, got ${cosCache.dims.length}`); + } + if (sinCache.dims.length !== 2) { + throw new Error(`Input 'sin_cache' is expected to have 2 dimensions, got ${sinCache.dims.length}`); + } + if (!ShapeUtil.areEqual(cosCache.dims, sinCache.dims)) { + throw new Error('Inputs \'cos_cache\' and \'sin_cache\' are expected to have the same shape'); + } + + if (rotaryEmbeddingDim > 0 && numHeads === 0) { + throw new Error('num_heads must be provided if rotary_embedding_dim is specified'); + } + + const batchSize = input.dims[0]; + const sequenceLength = input.dims[input.dims.length - 2]; + const maxSequenceLength = cosCache.dims[0]; + const hiddenSize = ShapeUtil.sizeFromDimension(input.dims, 1) / sequenceLength; + const headSize = rotaryEmbeddingDim === 0 ? cosCache.dims[1] * 2 : hiddenSize / numHeads; + if (rotaryEmbeddingDim > headSize) { + throw new Error('rotary_embedding_dim must be less than or equal to head_size'); + } + + if (positionIds.dims.length === 2) { + if (batchSize !== positionIds.dims[0]) { + throw new Error(`Input 'position_ids' dimension 0 should be of size batch_size, got ${positionIds.dims[0]}`); + } + if (sequenceLength !== positionIds.dims[1]) { + throw new Error(`Input 'position_ids' dimension 1 should be of size sequence_length, got ${positionIds.dims[1]}`); + } + } + + if (headSize / 2 !== cosCache.dims[1] && rotaryEmbeddingDim / 2 !== cosCache.dims[1]) { + throw new Error(`Input 'cos_cache' dimension 1 should be same as head_size / 2 or rotary_embedding_dim / 2, got ${ + cosCache.dims[1]}`); + } + + if (sequenceLength > maxSequenceLength) { + throw new Error('Updating cos_cache and sin_cache in RotaryEmbedding is not currently supported'); + } +}; + +const createRotaryEmbeddingProgramInfo = + (inputs: readonly TensorView[], attributes: RotaryEmbeddingAttributes): ProgramInfo => { + const {interleaved, numHeads, rotaryEmbeddingDim, scale} = attributes; + const batchSize = inputs[0].dims[0]; + const batchStride = ShapeUtil.sizeFromDimension(inputs[0].dims, 1); + const sequenceLength = inputs[0].dims[inputs[0].dims.length - 2]; + const hiddenSize = batchStride / sequenceLength; + const halfRotaryEmbeddingDim = inputs[2].dims[1]; + const headSize = rotaryEmbeddingDim === 0 ? 
halfRotaryEmbeddingDim * 2 : hiddenSize / numHeads; + + // Rotary embeddings will be calculated in a pair-wise fashion. In accordance, use the shape + // [batch size, sequence length, num of heads, num of pairs to rotate + num of dims to copy] + // to unfold the global index in shader. + const globalShape = + new Array<number>(batchSize, sequenceLength, hiddenSize / headSize, headSize - halfRotaryEmbeddingDim); + const globalStrides = ShapeUtil.computeStrides(globalShape); + + const programUniforms: ProgramUniform[] = [ + {type: DataType.float, data: scale}, + {type: DataType.uint32, data: globalShape}, + {type: DataType.uint32, data: globalStrides}, + + // strides for addressing the input/output tensor, in permutated order to align with the unfolded global index, + // i.e. BSNH + ...(inputs[0].dims.length === 3 ? + new Array<ProgramUniform>({type: DataType.uint32, data: [batchStride, hiddenSize, headSize, 1]}) : + []), + ...(inputs[0].dims.length === 4 ? + new Array<ProgramUniform>( + {type: DataType.uint32, data: [batchStride, headSize, sequenceLength * headSize, 1]}) : + []), + + ...createTensorShapeVariables(inputs[0].dims, inputs[1].dims, inputs[2].dims, inputs[3].dims, inputs[0].dims), + ]; + + const getShaderSource = (shaderHelper: ShaderHelper) => { + const input = inputVariable('input', inputs[0].dataType, inputs[0].dims.length); + const positionIds = inputVariable('position_ids', inputs[1].dataType, inputs[1].dims.length); + const cosCache = inputVariable('cos_cache', inputs[2].dataType, inputs[2].dims.length); + const sinCache = inputVariable('sin_cache', inputs[3].dataType, inputs[3].dims.length); + const output = outputVariable('output', inputs[0].dataType, inputs[0].dims.length); + + shaderHelper.registerUniforms([ + {name: 'scale', type: 'f32'}, + {name: 'global_shape', type: 'u32', length: globalShape.length}, + {name: 'global_strides', type: 'u32', length: globalStrides.length}, + {name: 'input_output_strides', type: 'u32', length: globalStrides.length}, + ]); + + return ` + ${shaderHelper.declareVariables(input, positionIds, cosCache, sinCache, output)} + + ${shaderHelper.mainStart(WORKGROUP_SIZE)} + let half_rotary_emb_dim = uniforms.${cosCache.name}_shape[1]; + let bsnh = global_idx / uniforms.global_strides % uniforms.global_shape; + let size = uniforms.global_shape[0] * uniforms.global_strides[0]; + ${shaderHelper.guardAgainstOutOfBoundsWorkgroupSizes('size')} + + if (bsnh[3] < half_rotary_emb_dim) { + let position_ids_idx = + ${positionIds.broadcastedIndicesToOffset('bsnh.xy', outputVariable('', positionIds.type.tensor, 2))}; + let position_id = + u32(${positionIds.getByOffset('position_ids_idx')}) + select(0, bsnh[1], position_ids_idx == 0); + let i = dot(bsnh, uniforms.input_output_strides) + select(0, bsnh[3], ${interleaved}); + let j = i + select(half_rotary_emb_dim, 1, ${interleaved}); + let re = ${input.getByOffset('i')} * ${cosCache.get('position_id', 'bsnh[3]')} - + ${input.getByOffset('j')} * ${sinCache.get('position_id', 'bsnh[3]')}; + ${output.setByOffset('i', 're')} + let im = ${input.getByOffset('i')} * ${sinCache.get('position_id', 'bsnh[3]')} + + ${input.getByOffset('j')} * ${cosCache.get('position_id', 'bsnh[3]')}; + ${output.setByOffset('j', 'im')} + } else { + let k = dot(bsnh, uniforms.input_output_strides) + half_rotary_emb_dim; + ${output.setByOffset('k', input.getByOffset('k'))} + } + }`; + }; + + return { + name: 'RotaryEmbedding', + shaderCache: { + hint: createAttributeWithCacheKey({ + interleaved, + }).cacheKey, + inputDependencies: ['rank', 'rank', 'rank', 'rank'], + },
getShaderSource, + getRunData: () => ({ + outputs: [{dims: inputs[0].dims, dataType: inputs[0].dataType}], + dispatchGroup: {x: Math.ceil(ShapeUtil.size(globalShape) / WORKGROUP_SIZE)}, + programUniforms, + }), + }; + }; + +export const rotaryEmbedding = (context: ComputeContext, attributes: RotaryEmbeddingAttributes): void => { + validateInputs(context.inputs, attributes); + context.compute(createRotaryEmbeddingProgramInfo(context.inputs, attributes)); +}; diff --git a/js/web/lib/wasm/jsep/webgpu/ops/skip-layer-norm.ts b/js/web/lib/wasm/jsep/webgpu/ops/skip-layer-norm.ts index 7e500f865c19..e7dc34d2fc75 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/skip-layer-norm.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/skip-layer-norm.ts @@ -4,12 +4,12 @@ import {DataType} from '../../../wasm-common'; import {TensorView} from '../../tensor-view'; import {ShapeUtil} from '../../util'; -import {AttributeWithCacheKey, createAttributeWithCacheKey} from '../attribute-with-cache-key'; -import {ComputeContext, ProgramInfo} from '../types'; +import {ComputeContext, ProgramInfo, ProgramUniform} from '../types'; -import {castToF32, fillVector, getMaxComponents, inputVariable, outputVariable, ShaderHelper, sumVector, tensorTypeToWsglStorageType,} from './common'; +import {castToF32, fillVector, getMaxComponents, inputVariable, outputVariable, ShaderHelper, sumVector, tensorTypeToWsglStorageType, UniformsArrayType} from './common'; -export interface SkipLayerNormAttributes extends AttributeWithCacheKey { +export interface SkipLayerNormAttributes { + simplified: boolean; epsilon: number; } @@ -73,73 +73,89 @@ const validateInputs = (inputs: readonly TensorView[]): void => { const createSkipLayerNormProgramInfo = (inputs: readonly TensorView[], attributes: SkipLayerNormAttributes, outputCount: number, isTraining: boolean): ProgramInfo => { + const simplified = attributes.simplified; + const inputShape = inputs[0].dims; const inputSize = ShapeUtil.size(inputShape); const outputShape = inputShape; const outputSize = inputSize; const hiddenSize = inputShape.slice(-1)[0]; const meanInvStdDevDim = isTraining ? 
inputShape.slice(0, -1).concat(1) : []; - const hasBetaInput = inputs.length > 3; + const hasBetaInput = !simplified && inputs.length > 3; const hasBiasInput = inputs.length > 4; const hasMeanOutput = isTraining && outputCount > 1; const hasInvStdDevOutput = isTraining && outputCount > 2; const hasInputSkipBiasSumOutput = outputCount > 3; const components = getMaxComponents(hiddenSize); - const variables = [ - inputVariable('x', inputs[0].dataType, inputs[0].dims, components), - inputVariable('skip', inputs[1].dataType, inputs[1].dims, components), - inputVariable('gamma', inputs[2].dataType, inputs[2].dims, components), - ]; - if (hasBetaInput) { - variables.push(inputVariable('beta', inputs[3].dataType, inputs[3].dims, components)); - } - if (hasBiasInput) { - variables.push(inputVariable('bias', inputs[4].dataType, inputs[4].dims, components)); - } - variables.push(outputVariable('output', inputs[0].dataType, outputShape, components)); - if (hasMeanOutput) { - variables.push(outputVariable('meanOutput', DataType.float, meanInvStdDevDim)); - } - if (hasInvStdDevOutput) { - variables.push(outputVariable('invStdOutput', DataType.float, meanInvStdDevDim)); - } - if (hasInputSkipBiasSumOutput) { - variables.push(outputVariable('inputSkipBiasSum', inputs[0].dataType, outputShape, components)); - } - const dataType = tensorTypeToWsglStorageType(inputs[0].dataType); - const getShaderSource = (shaderHelper: ShaderHelper) => ` - const hiddenSize: f32 = ${hiddenSize}; - const hiddenSizeVectorized: u32 = ${hiddenSize / components}; - const epsilon: f32 = ${attributes.epsilon}; - ${shaderHelper.declareVariables(...variables)} + const programUniforms: ProgramUniform[] = [ + {type: DataType.uint32, data: outputSize}, + {type: DataType.uint32, data: components}, + {type: DataType.uint32, data: hiddenSize}, + {type: DataType.float, data: attributes.epsilon}, + ]; + const getShaderSource = (shaderHelper: ShaderHelper) => { + const uniformsArray: UniformsArrayType = [ + {name: 'output_size', type: 'u32'}, + {name: 'components', type: 'u32'}, + {name: 'hidden_size', type: 'u32'}, + {name: 'epsilon', type: 'f32'}, + ]; + const variables = [ + inputVariable('x', inputs[0].dataType, inputs[0].dims, components), + inputVariable('skip', inputs[1].dataType, inputs[1].dims, components), + inputVariable('gamma', inputs[2].dataType, inputs[2].dims, components), + ]; + if (hasBetaInput) { + variables.push(inputVariable('beta', inputs[3].dataType, inputs[3].dims, components)); + } + if (hasBiasInput) { + variables.push(inputVariable('bias', inputs[4].dataType, inputs[4].dims, components)); + } + variables.push(outputVariable('output', inputs[0].dataType, outputShape, components)); + if (hasMeanOutput) { + variables.push(outputVariable('mean_output', DataType.float, meanInvStdDevDim)); + } + if (hasInvStdDevOutput) { + variables.push(outputVariable('inv_std_output', DataType.float, meanInvStdDevDim)); + } + if (hasInputSkipBiasSumOutput) { + variables.push(outputVariable('input_skip_bias_sum', inputs[0].dataType, outputShape, components)); + } + const dataType = tensorTypeToWsglStorageType(inputs[0].dataType); + return ` + + ${shaderHelper.registerUniforms(uniformsArray).declareVariables(...variables)} ${shaderHelper.mainStart()} - ${shaderHelper.guardAgainstOutOfBoundsWorkgroupSizes(outputSize / hiddenSize)} - let offset = global_idx * hiddenSizeVectorized; + ${shaderHelper.guardAgainstOutOfBoundsWorkgroupSizes('uniforms.output_size / uniforms.hidden_size')} + let hidden_size_vectorized: u32 = uniforms.hidden_size / 
uniforms.components; + let offset = global_idx * hidden_size_vectorized; var sum = ${fillVector('f32', components)}; var squareSum = ${fillVector('f32', components)}; - for (var i: u32 = 0; i < hiddenSizeVectorized; i++) { - let skipValue = skip[offset + i]; - let biasValue = ${hasBiasInput ? 'bias[i]' : '0.0'}; - let inputValue = x[offset + i]; - let value = inputValue + skipValue + biasValue; - ${hasInputSkipBiasSumOutput ? 'inputSkipBiasSum[offset + i] = value;' : ''} + for (var i: u32 = 0; i < hidden_size_vectorized; i++) { + let skip_value = skip[offset + i]; + let bias_value = ${hasBiasInput ? 'bias[i]' : '0.0'}; + let input_value = x[offset + i]; + let value = input_value + skip_value + bias_value; + ${hasInputSkipBiasSumOutput ? 'input_skip_bias_sum[offset + i] = value;' : ''} output[offset + i] = value; - let f32Value = ${castToF32(dataType, components, 'value')}; - sum += f32Value; - squareSum += f32Value * f32Value; + let f32_value = ${castToF32(dataType, components, 'value')}; + sum += f32_value; + squareSum += f32_value * f32_value; } - let mean = ${sumVector('sum', components)} / hiddenSize; - let variance = sqrt(${sumVector('squareSum', components)} / hiddenSize - mean * mean + epsilon); - ${hasMeanOutput ? 'meanOutput[global_idx] = mean;' : ''} - ${hasInvStdDevOutput ? 'invStdOutput[global_idx] = 1.0 / variance;' : ''} - for (var i: u32 = 0; i < hiddenSizeVectorized; i++) { - output[offset + i] = (output[offset + i] - ${dataType}(mean)) / ${dataType}(variance) * gamma[i] - + ${hasBetaInput ? 'beta[i]' : '0.0'}; + let mean = ${sumVector('sum', components)} / f32(uniforms.hidden_size); + let inv_std_dev = inverseSqrt(${sumVector('squareSum', components)} / f32(uniforms.hidden_size) ${ + simplified ? '' : '- mean * mean'} + uniforms.epsilon); + ${hasMeanOutput ? 'mean_output[global_idx] = mean;' : ''} + ${hasInvStdDevOutput ? 'inv_std_output[global_idx] = inv_std_dev;' : ''} + for (var i: u32 = 0; i < hidden_size_vectorized; i++) { + output[offset + i] = (output[offset + i] ${simplified ? '' : `- ${dataType}(mean)`}) * ${ + dataType}(inv_std_dev) * gamma[i] ${hasBetaInput ? 
'+ beta[i]' : ''}; } }`; + }; const outputs = [{dims: outputShape, dataType: inputs[0].dataType}]; if (outputCount > 1) { outputs.push({dims: meanInvStdDevDim, dataType: DataType.float}); @@ -150,12 +166,14 @@ const createSkipLayerNormProgramInfo = if (outputCount > 3) { outputs.push({dims: inputShape, dataType: inputs[0].dataType}); } - return { name: 'SkipLayerNormalization', - shaderCache: {hint: attributes.cacheKey}, + shaderCache: { + hint: `${components};${hasMeanOutput};${hasInvStdDevOutput};${hasInputSkipBiasSumOutput}`, + inputDependencies: inputs.map((_input, _index) => 'type') + }, getShaderSource, - getRunData: () => ({outputs, dispatchGroup: {x: Math.ceil(outputSize / hiddenSize / 64)}}), + getRunData: () => ({outputs, dispatchGroup: {x: Math.ceil(outputSize / hiddenSize / 64)}, programUniforms}), }; }; @@ -178,8 +196,3 @@ export const skipLayerNorm = (context: ComputeContext, attributes: SkipLayerNorm context.compute( createSkipLayerNormProgramInfo(context.inputs, attributes, context.outputCount, isTraining), {outputs}); }; - -export const parseSkipLayerNormAttributes = (attributes: Record): SkipLayerNormAttributes => { - const epsilon = attributes.epsilon as number; - return createAttributeWithCacheKey({epsilon}); -}; diff --git a/js/web/lib/wasm/jsep/webgpu/ops/slice.ts b/js/web/lib/wasm/jsep/webgpu/ops/slice.ts index 5212c6475dce..a5e71f30e596 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/slice.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/slice.ts @@ -155,9 +155,9 @@ const createSliceProgramInfo = (inputs: readonly TensorView[], attributes: Slice ]; const programUniforms: ProgramUniform[] = [ - {type: 'uint32', data: outputSize}, {type: 'uint32', data: starts}, {type: 'int32', data: signs}, - {type: 'uint32', data: steps}, ...createTensorShapeVariables(inputs[0].dims), - ...createTensorShapeVariables(outputShape) + {type: DataType.uint32, data: outputSize}, {type: DataType.uint32, data: starts}, + {type: DataType.int32, data: signs}, {type: DataType.uint32, data: steps}, + ...createTensorShapeVariables(inputs[0].dims, outputShape) ]; const getShaderSource = (shaderHelper: ShaderHelper) => ` diff --git a/js/web/lib/wasm/jsep/webgpu/ops/softmax.ts b/js/web/lib/wasm/jsep/webgpu/ops/softmax.ts index 324dc3af1a71..b0e3ddd14965 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/softmax.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/softmax.ts @@ -5,6 +5,7 @@ // performance limitations when the reduced axis is long. Need to add // a optimized codepath for this. +import {DataType} from '../../../wasm-common'; import {TensorView} from '../../tensor-view'; import {ShapeUtil} from '../../util'; import {AttributeWithCacheKey, createAttributeWithCacheKey} from '../attribute-with-cache-key'; @@ -136,7 +137,7 @@ const createSoftmaxProgramInfo = (input: TensorView, attributes: SoftmaxAttribut getRunData: () => ({ outputs: [{dims: shape, dataType: input.dataType}], dispatchGroup: {x: rows}, - programUniforms: [{type: 'uint32', data: packedCols}] + programUniforms: [{type: DataType.int32, data: packedCols}] }), getShaderSource, }; diff --git a/js/web/lib/wasm/jsep/webgpu/ops/split.ts b/js/web/lib/wasm/jsep/webgpu/ops/split.ts index b8582614fa21..a09ac78b1700 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/split.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/split.ts @@ -1,6 +1,7 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. 
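The `simplified` flag threaded through SkipLayerNorm above selects an RMS-style normalization: the mean is neither subtracted from the value nor folded into the variance. A plain reference of the two formulas the shader implements (a sketch of the math only; the real kernel also vectorizes, runs one invocation per row, and emits the optional mean/inv-std-dev outputs):

```ts
// Reference for the normalization math in the SkipLayerNorm shader above,
// where `x` is the already-computed input + skip + bias sum for one row.
// simplified === false: y = (x - mean) / sqrt(var + eps) * gamma + beta
// simplified === true : y = x / sqrt(mean(x^2) + eps) * gamma   (RMS-style)
function skipLayerNorm(
    x: Float32Array, gamma: Float32Array, beta: Float32Array|undefined, epsilon: number,
    simplified: boolean): Float32Array {
  let sum = 0;
  let squareSum = 0;
  for (const v of x) {
    sum += v;
    squareSum += v * v;
  }
  const mean = sum / x.length;
  // The simplified path drops `- mean * mean`, exactly as the shader does.
  const invStdDev = 1 / Math.sqrt(squareSum / x.length - (simplified ? 0 : mean * mean) + epsilon);
  const out = new Float32Array(x.length);
  for (let i = 0; i < x.length; i++) {
    const centered = simplified ? x[i] : x[i] - mean;
    out[i] = centered * invStdDev * gamma[i] + (!simplified && beta ? beta[i] : 0);
  }
  return out;
}
```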
+import {DataType} from '../../../wasm-common'; import {TensorView} from '../../tensor-view'; import {ShapeUtil} from '../../util'; import {AttributeWithCacheKey, createAttributeWithCacheKey} from '../attribute-with-cache-key'; @@ -67,24 +68,23 @@ const createSplitProgramInfo = (inputs: readonly TensorView[], attributes: Split const dataType = inputs[0].dataType; const axis = ShapeUtil.normalizeAxis(attributes.axis, inputShape.length); const outputs = new Array<IndicesHelper>(attributes.numOutputs); - const input = inputVariable('input', dataType, inputShape); + const input = inputVariable('input', dataType, inputShape.length); const sizeInSplitAxis = new Array<number>(attributes.numOutputs); const outputsTensorInfo: TensorInfo[] = []; const outputShapes: number[][] = []; let previousSum = 0; - const programUniforms: ProgramUniform[] = [{type: 'uint32', data: inputSize}]; + const programUniforms: ProgramUniform[] = [{type: DataType.uint32, data: inputSize}]; for (let i = 0; i < attributes.numOutputs; i++) { previousSum += attributes.splitSizes[i]; sizeInSplitAxis[i] = previousSum; const outputShape = inputShape.slice(); outputShape[attributes.axis] = attributes.splitSizes[i]; outputShapes.push(outputShape); - outputs[i] = outputVariable(`output${i}`, dataType, outputShape); + outputs[i] = outputVariable(`output${i}`, dataType, outputShape.length); outputsTensorInfo.push({dims: outputShapes[i], dataType: inputs[0].dataType}); } - programUniforms.push({type: 'uint32', data: sizeInSplitAxis}); - programUniforms.push(...createTensorShapeVariables(inputShape)); - outputShapes.forEach((outputShape) => programUniforms.push(...createTensorShapeVariables(outputShape))); + programUniforms.push( + {type: DataType.uint32, data: sizeInSplitAxis}, ...createTensorShapeVariables(inputShape, ...outputShapes)); const getShaderSource = (shaderHelper: ShaderHelper) => ` ${ shaderHelper.registerUniform('input_size', 'u32') diff --git a/js/web/lib/wasm/jsep/webgpu/ops/tile.ts b/js/web/lib/wasm/jsep/webgpu/ops/tile.ts index 90a36a7bec2a..f9728575fe07 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/tile.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/tile.ts @@ -79,10 +79,8 @@ export const createTileProgramInfo = (inputs: readonly TensorView[]): ProgramInf getRunData: () => ({ outputs: [{dims: outputShape, dataType: inputs[0].dataType}], dispatchGroup: {x: Math.ceil(outputSize / 64 /* workgroup size */)}, - programUniforms: [ - {type: 'uint32', data: outputSize}, ...createTensorShapeVariables(inputs[0].dims), - ...createTensorShapeVariables(outputShape) - ], + programUniforms: + [{type: DataType.uint32, data: outputSize}, ...createTensorShapeVariables(inputs[0].dims, outputShape)], }), getShaderSource, }; diff --git a/js/web/lib/wasm/jsep/webgpu/ops/transpose.ts b/js/web/lib/wasm/jsep/webgpu/ops/transpose.ts index c4d43e9f466f..7ae801222b87 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/transpose.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/transpose.ts @@ -1,12 +1,13 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License.
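split.ts and tile.ts above, and transpose.ts just below, all collapse repeated `...createTensorShapeVariables(a), ...createTensorShapeVariables(b)` spreads into one variadic call. Assuming the helper behaves as its call sites here imply (each shape contributes its dims and strides as uint32 program uniforms), the variadic form is pure concatenation — a sketch with simplified stand-in types, not the repo's actual implementation:

```ts
// Simplified stand-ins; the real ProgramUniform/DataType live in this repo.
type ProgramUniform = {type: 'uint32'; data: number|number[]};

const computeStrides = (dims: readonly number[]): number[] => {
  const strides = new Array<number>(dims.length).fill(1);
  for (let i = dims.length - 2; i >= 0; i--) {
    strides[i] = strides[i + 1] * dims[i + 1];
  }
  return strides;
};

// Assumption: each shape yields its dims and strides as uint32 uniforms.
const createTensorShapeVariables = (...shapes: Array<readonly number[]>): ProgramUniform[] =>
    shapes.flatMap(dims => [{type: 'uint32', data: [...dims]}, {type: 'uint32', data: computeStrides(dims)}]);

// Hence createTensorShapeVariables(inputShape, outputShape) produces the same
// uniforms as [...createTensorShapeVariables(inputShape), ...createTensorShapeVariables(outputShape)].
```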
+import {DataType} from '../../../wasm-common'; import {TensorView} from '../../tensor-view'; import {ShapeUtil} from '../../util'; import {AttributeWithCacheKey, createAttributeWithCacheKey} from '../attribute-with-cache-key'; import {ComputeContext, ProgramInfo} from '../types'; -import {createTensorShapeVariables, enableShapesUniforms, IndicesHelper, inputVariable, outputVariable, ShaderHelper} from './common'; +import {createTensorShapeVariables, IndicesHelper, inputVariable, outputVariable, ShaderHelper} from './common'; export interface TransposeAttributes extends AttributeWithCacheKey { readonly perm: number[]; @@ -39,12 +40,9 @@ export const createTransposeProgramInfo = (inputTensor: TensorView, permAttr: nu const inputDataType = inputTensor.dataType; const inputRank = inputTensor.dims.length; const perm = getAdjustedPerm(inputRank, permAttr); - const useShapesUniforms = enableShapesUniforms(inputRank); const outputShape = getOutputShape(inputTensor.dims, perm); - const outShapeOrRank = useShapesUniforms ? outputShape.length : outputShape; - const inShapeOrRank = useShapesUniforms ? inputRank : inputTensor.dims; - const output = outputVariable('output', inputDataType, outShapeOrRank); - const input = inputVariable('a', inputDataType, inShapeOrRank); + const output = outputVariable('output', inputDataType, outputShape.length); + const input = inputVariable('a', inputDataType, inputRank); const getShaderSource = (shaderHelper: ShaderHelper) => ` ${shaderHelper.registerUniform('output_size', 'u32').declareVariables(input, output)} @@ -61,21 +59,14 @@ export const createTransposeProgramInfo = (inputTensor: TensorView, permAttr: nu }`; return { name: 'Transpose', - shaderCache: {hint: `${permAttr}`, inputDependencies: useShapesUniforms ? ['rank'] : ['dims']}, + shaderCache: {hint: `${permAttr}`, inputDependencies: ['rank']}, getRunData: (inputs) => { const outputSize = ShapeUtil.size(outputShape); return { outputs: [{dims: outputShape, dataType: inputs[0].dataType}], dispatchGroup: {x: Math.ceil(outputSize / 64 /* workgroup size */)}, - programUniforms: useShapesUniforms ? 
- [ - {type: 'uint32', data: outputSize}, - ...createTensorShapeVariables(inputs[0].dims), - ...createTensorShapeVariables(outputShape), - ] : - [ - {type: 'uint32', data: outputSize}, - ], + programUniforms: + [{type: DataType.uint32, data: outputSize}, ...createTensorShapeVariables(inputs[0].dims, outputShape)], }; }, getShaderSource, diff --git a/js/web/lib/wasm/jsep/webgpu/ops/unary-op.ts b/js/web/lib/wasm/jsep/webgpu/ops/unary-op.ts index a25e7fe4229b..5f105c745739 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/unary-op.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/unary-op.ts @@ -53,7 +53,7 @@ const createElementwiseProgramInfo = dispatchGroup: {x: Math.ceil(ShapeUtil.size(inputTensors[0].dims) / 64 /* workgroup size */ / 4 /* vec size */)}, programUniforms: [ - {type: 'uint32', data: Math.ceil(ShapeUtil.size(input.dims) / 4)}, + {type: DataType.uint32, data: Math.ceil(ShapeUtil.size(input.dims) / 4)}, ], }) }); @@ -178,7 +178,7 @@ export const elu = (context: ComputeContext, attributes: AlphaAttributes): void attributes.cacheKey)); }; -export const erfImpl = (dataType: string, varType = 'f32') => ` +export const erfImpl = (varType = 'f32') => ` const r0: ${varType} = 0.3275911; const r1: ${varType} = 0.254829592; const r2: ${varType} = -0.284496736; @@ -186,7 +186,7 @@ const r3: ${varType} = 1.421413741; const r4: ${varType} = -1.453152027; const r5: ${varType} = 1.061405429; -fn erf_vf32(v: ${dataType}) -> ${dataType} { +fn erf_vf32(v: vec4<${varType}>) -> vec4<${varType}> { let absv = abs(v); let x = 1.0 / (1.0 + r0 * absv); return sign(v) * (1.0 - ((((r5 * x + r4) * x + r3) * x + r2) * x + r1) * x * exp(-absv * absv)); @@ -194,8 +194,7 @@ fn erf_vf32(v: ${dataType}) -> ${dataType} { export const erf = (context: ComputeContext): void => { const dataType = tensorTypeToWsglValueType(context.inputs[0].dataType); - context.compute(createElementwiseProgramInfo( - context.inputs[0], 'Erf', a => `erf_vf32(${a})`, erfImpl(`vec4<${dataType}>`, dataType))); + context.compute(createElementwiseProgramInfo(context.inputs[0], 'Erf', a => `erf_vf32(${a})`, erfImpl(dataType))); }; export const exp = (context: ComputeContext): void => { @@ -209,8 +208,7 @@ export const floor = (context: ComputeContext): void => { export const gelu = (context: ComputeContext): void => { const dataType = tensorTypeToWsglValueType(context.inputs[0].dataType); context.compute(createElementwiseProgramInfo( - context.inputs[0], 'Gelu', a => `0.5 * ${a} * (1.0 + erf_vf32(${a} * 0.7071067811865475))`, - erfImpl(`vec4<${dataType}>`, dataType))); + context.inputs[0], 'Gelu', a => `0.5 * ${a} * (1.0 + erf_vf32(${a} * 0.7071067811865475))`, erfImpl(dataType))); }; export const leakyRelu = (context: ComputeContext, attributes: AlphaAttributes): void => { @@ -242,6 +240,26 @@ export const sigmoid = (context: ComputeContext): void => { context.compute(createElementwiseProgramInfo(context.inputs[0], 'Sigmoid', a => `(1.0 / (1.0 + exp(-${a})))`)); }; +export interface HardSigmoidAttributes extends AttributeWithCacheKey { + readonly alpha: number; + readonly beta: number; +} + +export const parseHardSigmoidAttributes = (attributes: Record<string, unknown>): HardSigmoidAttributes => + createAttributeWithCacheKey(attributes as { + alpha: number; + beta: number; + }); + +export const hardSigmoid = (context: ComputeContext, attributes: HardSigmoidAttributes): void => { + const dataType = tensorTypeToWsglValueType(context.inputs[0].dataType); + context.compute(createElementwiseProgramInfo( + context.inputs[0], 'HardSigmoid', + a => `max(vec4<${dataType}>(0.0),
min(vec4<${dataType}>(1.0), ${attributes.alpha} * ${a} + vec4<${dataType}>(${ + attributes.beta})))`, + undefined, attributes.cacheKey)); +}; + export const sin = (context: ComputeContext): void => { context.compute(createElementwiseProgramInfo(context.inputs[0], 'Sin', 'sin')); }; @@ -258,8 +276,31 @@ export const tan = (context: ComputeContext): void => { context.compute(createElementwiseProgramInfo(context.inputs[0], 'Tan', 'tan')); }; +export const tanhExpression = (a: string) => `sign(${a}) * (1 - exp(-2 * abs(${a}))) / (1 + exp(-2 * abs(${a})))`; + export const tanh = (context: ComputeContext): void => { - context.compute(createElementwiseProgramInfo(context.inputs[0], 'Tanh', 'tanh')); + // TODO: revisit after https://github.com/gpuweb/gpuweb/issues/4458 is resolved + context.compute(createElementwiseProgramInfo(context.inputs[0], 'Tanh', tanhExpression)); +}; + +export const fastGeluImpl = (varType = 'f32') => ` +const fast_gelu_a: ${varType} = 0.5; +const fast_gelu_b: ${varType} = 0.7978845608028654; +const fast_gelu_c: ${varType} = 0.035677408136300125; + +fn tanh_v(v: vec4<${varType}>) -> vec4<${varType}> { + return ${tanhExpression('v')}; +} +`; + +export const fastGeluExpression = (x: string) => + `(fast_gelu_a + fast_gelu_a * tanh_v(${x} * (fast_gelu_c * ${x} * ${x} + fast_gelu_b))) * ${x}`; + +export const fastGelu = (context: ComputeContext): void => { + const dataType = tensorTypeToWsglValueType(context.inputs[0].dataType); + context.compute(createElementwiseProgramInfo( + context.inputs[0], 'FastGelu', fastGeluExpression, fastGeluImpl(dataType), undefined, + context.inputs[0].dataType)); }; export const thresholdedRelu = (context: ComputeContext, attributes: AlphaAttributes): number => { diff --git a/js/web/lib/wasm/jsep/webgpu/ops/where.ts b/js/web/lib/wasm/jsep/webgpu/ops/where.ts index 687ee054096c..a6375847fc42 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/where.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/where.ts @@ -27,7 +27,7 @@ const createWhereOpProgramShader = const expressionA = `a_data[index_a${x}][component_a${x}]`; const expressionB = `b_data[index_b${x}][component_b${x}]`; // eslint-disable-next-line no-bitwise - const expressionC = `bool(c_data[index_c${x}] & ${0xff000000 >>> ((3 - x) * 8)}u)`; + const expressionC = `bool(c_data[index_c${x}] & (0xffu << (component_c${x} * 8)))`; return ` let output_indices${x} = ${output.offsetToIndices(`global_idx * 4u + ${x}u`)}; let offset_a${x} = ${a.broadcastedIndicesToOffset(`output_indices${x}`, output)}; @@ -38,6 +38,7 @@ const createWhereOpProgramShader = let index_c${x} = offset_c${x} / 4u; let component_a${x} = offset_a${x} % 4u; let component_b${x} = offset_b${x} % 4u; + let component_c${x} = offset_c${x} % 4u; ${resStr}[${x}] = ${typeCast}(${expression(expressionA, expressionB, expressionC)}); `; }; @@ -76,7 +77,6 @@ const createWhereOpProgramInfo = (inputs: readonly TensorView[]): ProgramInfo => const isBroadcast = !(ShapeUtil.areEqual(dimsA, dimsB) && ShapeUtil.areEqual(dimsB, dimsC)); let outputShape = dimsA; let outputSize = ShapeUtil.size(dimsA); - const vecSize = Math.ceil(outputSize / 4); // TODO: deal with zero-sized tensors (eg. 
dims=[1,0]) if (isBroadcast) { @@ -88,6 +88,8 @@ const createWhereOpProgramInfo = (inputs: readonly TensorView[]): ProgramInfo => outputSize = ShapeUtil.size(outputShape); } + const vecSize = Math.ceil(outputSize / 4); + return { name: 'Where', shaderCache: {inputDependencies: ['rank', 'rank', 'rank']}, @@ -96,10 +98,8 @@ const createWhereOpProgramInfo = (inputs: readonly TensorView[]): ProgramInfo => getRunData: () => ({ outputs: [{dims: outputShape, dataType: outputDataType}], dispatchGroup: {x: Math.ceil(outputSize / 64 /* workgroup size */ / 4 /* vec size */)}, - programUniforms: [ - {type: 'uint32', data: vecSize}, ...createTensorShapeVariables(dimsC), ...createTensorShapeVariables(dimsA), - ...createTensorShapeVariables(dimsB), ...createTensorShapeVariables(outputShape) - ], + programUniforms: + [{type: DataType.uint32, data: vecSize}, ...createTensorShapeVariables(dimsC, dimsA, dimsB, outputShape)], }), }; }; diff --git a/js/web/lib/wasm/jsep/webgpu/program-manager.ts b/js/web/lib/wasm/jsep/webgpu/program-manager.ts index ae5bf68483b4..ccbcbe48505d 100644 --- a/js/web/lib/wasm/jsep/webgpu/program-manager.ts +++ b/js/web/lib/wasm/jsep/webgpu/program-manager.ts @@ -1,10 +1,10 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -import {tensorDataTypeEnumToString} from '../../wasm-common'; +import {TRACE_FUNC_BEGIN, TRACE_FUNC_END} from 'onnxruntime-common'; + import {WebGpuBackend} from '../backend-webgpu'; import {LOG_DEBUG} from '../log'; -import {TensorView} from '../tensor-view'; import {createShaderHelper} from './ops/common'; import {Artifact, GpuData, ProgramInfo} from './types'; @@ -32,13 +32,12 @@ export class ProgramManager { setArtifact(key: unknown, artifact: Artifact): void { this.repo.set(key, artifact); } - run(buildArtifact: Artifact, inputTensorViews: readonly TensorView[], outputTensorViews: readonly TensorView[], - inputs: GpuData[], outputs: GpuData[], dispatchGroup: [number, number, number], + run(buildArtifact: Artifact, inputs: GpuData[], outputs: GpuData[], dispatchGroup: [number, number, number], uniformBufferBinding: GPUBindingResource|undefined): void { + TRACE_FUNC_BEGIN(buildArtifact.programInfo.name); const device = this.backend.device; - const computePassEncoder = this.backend.getComputePassEncoder(); - computePassEncoder.setPipeline(buildArtifact.computePipeline); + this.backend.writeTimestamp(this.backend.pendingDispatchNumber * 2); const entries = []; for (const input of inputs) { entries.push({binding: entries.length, resource: {buffer: input.buffer}}); @@ -51,94 +50,44 @@ export class ProgramManager { } const bindGroup = device.createBindGroup( {layout: buildArtifact.computePipeline.getBindGroupLayout(0), entries, label: buildArtifact.programInfo.name}); - computePassEncoder.setBindGroup(0, bindGroup); - computePassEncoder.dispatchWorkgroups(...dispatchGroup); + if (this.backend.sessionStatus === 'capturing') { + const commandInfo = { + kernelId: this.backend.currentKernelId!, + computePipeline: buildArtifact.computePipeline, + bindGroup, + dispatchGroup + }; + const sessionCommandList = this.backend.capturedCommandList.get(this.backend.currentSessionId!); + sessionCommandList!.push(commandInfo); + } + computePassEncoder.setPipeline(buildArtifact.computePipeline); + computePassEncoder.setBindGroup(0, bindGroup); + computePassEncoder.dispatchWorkgroups(...dispatchGroup); + this.backend.writeTimestamp(this.backend.pendingDispatchNumber * 2 + 1); this.backend.pendingDispatchNumber++; - if 
(this.backend.isQueryEnabled()) { - if (typeof this.backend.queryData === 'undefined') { - this.backend.queryData = this.backend.gpuDataManager.create( - // eslint-disable-next-line no-bitwise - this.backend.querySetCount * 8, GPUBufferUsage.COPY_SRC | GPUBufferUsage.QUERY_RESOLVE); - } - const syncData = this.backend.gpuDataManager.create( - // eslint-disable-next-line no-bitwise - this.backend.querySetCount * 8, GPUBufferUsage.MAP_READ | GPUBufferUsage.COPY_DST); - + if (this.backend.pendingDispatchNumber >= this.backend.maxDispatchNumber || + this.backend.queryType === 'at-passes') { this.backend.endComputePass(); - this.backend.getCommandEncoder().resolveQuerySet(this.backend.querySet!, 0, 2, this.backend.queryData.buffer, 0); - this.backend.getCommandEncoder().copyBufferToBuffer( - this.backend.queryData.buffer, 0, syncData.buffer, 0, this.backend.querySetCount * 8); - this.backend.flush(); - - const kernelId = this.backend.currentKernelId!; - const kernelInfo = this.backend.kernels.get(kernelId)!; - - void syncData.buffer.mapAsync(GPUMapMode.READ).then(() => { - const mappedData = new BigUint64Array(syncData.buffer.getMappedRange()); - const [startTimeU64, endTimeU64] = mappedData; - const [kernelType, kernelName] = kernelInfo; - - syncData.buffer.unmap(); - - if (typeof this.backend.queryTimeBase === 'undefined') { - this.backend.queryTimeBase = startTimeU64; - } - - const startTime = Number(startTimeU64 - this.backend.queryTimeBase); - const endTime = Number(endTimeU64 - this.backend.queryTimeBase); - - if (!Number.isSafeInteger(startTime) || !Number.isSafeInteger(endTime)) { - throw new RangeError('incorrect timestamp range'); - } - - this.backend.gpuDataManager.release(syncData.id); - if (this.backend.env.webgpu.profiling?.ondata) { - this.backend.env.webgpu.profiling.ondata({ - version: 1, - inputsMetadata: inputTensorViews.map( - value => ({dims: value.dims, dataType: tensorDataTypeEnumToString(value.dataType)})), - outputsMetadata: outputTensorViews.map( - value => ({dims: value.dims, dataType: tensorDataTypeEnumToString(value.dataType)})), - kernelId, - kernelType, - kernelName, - startTime, - endTime, - }); - } else { - // if no callback is provided, print the profiling message to console - let inputShapes = ''; - inputTensorViews.forEach((value, i) => { - inputShapes += `input[${i}]: [${value.dims}] | ${tensorDataTypeEnumToString(value.dataType)}, `; - }); - let outputShapes = ''; - outputTensorViews.forEach((value, i) => { - outputShapes += `output[${i}]: [${value.dims}] | ${tensorDataTypeEnumToString(value.dataType)}, `; - }); - // eslint-disable-next-line no-console - console.log(`[profiling] kernel "${kernelId}|${kernelName}|${buildArtifact.programInfo.name}" ${inputShapes}${ - outputShapes}execution time: ${endTime - startTime} ns`); - } - }); } - - if (this.backend.pendingDispatchNumber >= 16) { + if (this.backend.pendingDispatchNumber >= this.backend.maxDispatchNumber) { this.backend.flush(); } + TRACE_FUNC_END(buildArtifact.programInfo.name); } dispose(): void { // this.repo.forEach(a => this.glContext.deleteProgram(a.program)); } build(programInfo: ProgramInfo, normalizedDispatchGroupSize: [number, number, number]): Artifact { + TRACE_FUNC_BEGIN(programInfo.name); const device = this.backend.device; const extensions: string[] = []; if (device.features.has('shader-f16')) { extensions.push('enable f16;'); } - const shaderHelper = createShaderHelper(normalizedDispatchGroupSize); + const shaderHelper = createShaderHelper(normalizedDispatchGroupSize, 
this.backend.device.limits); const userCode = programInfo.getShaderSource(shaderHelper); const code = `${extensions.join('\n')}\n${shaderHelper.additionalImplementations}\n${userCode}`; const shaderModule = device.createShaderModule({code, label: programInfo.name}); @@ -147,7 +96,8 @@ export class ProgramManager { const computePipeline = device.createComputePipeline( {compute: {module: shaderModule, entryPoint: 'main'}, layout: 'auto', label: programInfo.name}); - return {programInfo, computePipeline}; + TRACE_FUNC_END(programInfo.name); + return {programInfo, computePipeline, uniformVariablesInfo: shaderHelper.variablesInfo}; } normalizeDispatchGroupSize(dispatchGroup: ReturnType['dispatchGroup']): diff --git a/js/web/lib/wasm/jsep/webgpu/types.ts b/js/web/lib/wasm/jsep/webgpu/types.ts index 23fa33a9bba8..2a584fc0a221 100644 --- a/js/web/lib/wasm/jsep/webgpu/types.ts +++ b/js/web/lib/wasm/jsep/webgpu/types.ts @@ -1,10 +1,13 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. +import {DataType} from '../../wasm-common'; import {TensorView} from '../tensor-view'; import {ShaderHelper} from './ops/common'; +export type SessionState = 'default'|'capturing'|'replaying'; + export enum GpuDataType { default = 0, upload = 1, @@ -12,6 +15,13 @@ export enum GpuDataType { } export type GpuDataId = number; +export type GpuArchitecture = 'ampere'; +export type GpuVendor = 'amd'|'intel'|'nvidia'; +export interface AdapterInfo { + isArchitecture: (architecture: GpuArchitecture) => boolean; + isVendor: (vendor: GpuVendor) => boolean; +} + export interface GpuData { type: GpuDataType; id: GpuDataId; @@ -23,12 +33,13 @@ export interface TensorInfo { dataType: number; } - export interface ProgramUniform { - type: 'int32'|'float32'|'uint32'; + type: DataType; data: number|readonly number[]; } +export type ProgramUniformVariableInfo = [type: DataType, length: number]; + /** * Represent the dependency of a program on a specific input tensor. * @@ -116,6 +127,7 @@ export interface ProgramInfo { export interface Artifact { programInfo: ProgramInfo; computePipeline: GPUComputePipeline; + uniformVariablesInfo: readonly ProgramUniformVariableInfo[]|undefined; } export interface ComputeContextInputsOutputsMapping { @@ -144,6 +156,11 @@ export interface ComputeContextInputsOutputsMapping { * A ComputeContext instance carries the states that representing the current running of a kernel. 
*/ export interface ComputeContext { + /** + * gpu adapter info + */ + readonly adapterInfo: AdapterInfo; + /** * stores the pointer to OpKernelContext */ @@ -171,4 +188,8 @@ export interface ComputeContext { compute(program: ProgramInfo, inputsOutputsMapping?: ComputeContextInputsOutputsMapping): TensorView[]; output(index: number, dims: readonly number[]): number; + getMaxComputeWorkgroupSizes(): [number, number, number]; + getMaxComputeWorkgroupStoragesize(): number; } + +export type TimestampQuery = 'none'|'inside-passes'|'at-passes'; diff --git a/js/web/lib/wasm/proxy-worker/main.ts b/js/web/lib/wasm/proxy-worker/main.ts index 4df524cdcfb2..3ce37a2d6b65 100644 --- a/js/web/lib/wasm/proxy-worker/main.ts +++ b/js/web/lib/wasm/proxy-worker/main.ts @@ -79,8 +79,14 @@ self.onmessage = (ev: MessageEvent): void => { } case 'create': { const {model, options} = message!; - const sessionMetadata = createSession(model, options); - postMessage({type, out: sessionMetadata} as OrtWasmMessage); + createSession(model, options) + .then( + sessionMetadata => { + postMessage({type, out: sessionMetadata} as OrtWasmMessage); + }, + err => { + postMessage({type, err}); + }); break; } case 'release': @@ -97,7 +103,7 @@ self.onmessage = (ev: MessageEvent): void => { } else { postMessage( {type, out: outputs} as OrtWasmMessage, - extractTransferableBuffers(outputs as SerializableTensorMetadata[])); + extractTransferableBuffers([...inputs, ...outputs] as SerializableTensorMetadata[])); } }, err => { diff --git a/js/web/lib/wasm/proxy-wrapper.ts b/js/web/lib/wasm/proxy-wrapper.ts index 86017a4ec690..6ff4e86b1235 100644 --- a/js/web/lib/wasm/proxy-wrapper.ts +++ b/js/web/lib/wasm/proxy-wrapper.ts @@ -155,7 +155,7 @@ export const createSession = ensureWorker(); return new Promise((resolve, reject) => { enqueueCallbacks('create', [resolve, reject]); - const message: OrtWasmMessage = {type: 'create', in : {model, options}}; + const message: OrtWasmMessage = {type: 'create', in : {model, options: {...options}}}; const transferable: Transferable[] = []; if (model instanceof Uint8Array) { transferable.push(model.buffer); diff --git a/js/web/lib/wasm/session-handler-inference.ts b/js/web/lib/wasm/session-handler-inference.ts index b62287483208..2bece248669f 100644 --- a/js/web/lib/wasm/session-handler-inference.ts +++ b/js/web/lib/wasm/session-handler-inference.ts @@ -1,12 +1,12 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -import {readFile} from 'node:fs/promises'; -import {InferenceSession, InferenceSessionHandler, SessionHandler, Tensor} from 'onnxruntime-common'; +import {InferenceSession, InferenceSessionHandler, SessionHandler, Tensor, TRACE_FUNC_BEGIN, TRACE_FUNC_END} from 'onnxruntime-common'; import {SerializableInternalBuffer, TensorMetadata} from './proxy-messages'; import {copyFromExternalBuffer, createSession, endProfiling, releaseSession, run} from './proxy-wrapper'; import {isGpuBufferSupportedType} from './wasm-common'; +import {loadFile} from './wasm-utils-load-file'; export const encodeTensorMetadata = (tensor: Tensor, getName: () => string): TensorMetadata => { switch (tensor.location) { @@ -43,23 +43,18 @@ export class OnnxruntimeWebAssemblySessionHandler implements InferenceSessionHan outputNames: string[]; async fetchModelAndCopyToWasmMemory(path: string): Promise { - // fetch model from url and move to wasm heap. 
The arraybufffer that held the http - // response is freed once we return - const response = await fetch(path); - if (response.status !== 200) { - throw new Error(`failed to load model: ${path}`); - } - const arrayBuffer = await response.arrayBuffer(); - return copyFromExternalBuffer(new Uint8Array(arrayBuffer)); + // fetch model from url and move to wasm heap. + return copyFromExternalBuffer(await loadFile(path)); } async loadModel(pathOrBuffer: string|Uint8Array, options?: InferenceSession.SessionOptions): Promise { + TRACE_FUNC_BEGIN(); let model: Parameters[0]; if (typeof pathOrBuffer === 'string') { if (typeof process !== 'undefined' && process.versions && process.versions.node) { // node - model = await readFile(pathOrBuffer); + model = await loadFile(pathOrBuffer); } else { // browser // fetch model and copy to wasm heap. @@ -70,6 +65,7 @@ export class OnnxruntimeWebAssemblySessionHandler implements InferenceSessionHan } [this.sessionId, this.inputNames, this.outputNames] = await createSession(model, options); + TRACE_FUNC_END(); } async dispose(): Promise { @@ -78,6 +74,7 @@ export class OnnxruntimeWebAssemblySessionHandler implements InferenceSessionHan async run(feeds: SessionHandler.FeedsType, fetches: SessionHandler.FetchesType, options: InferenceSession.RunOptions): Promise { + TRACE_FUNC_BEGIN(); const inputArray: Tensor[] = []; const inputIndices: number[] = []; Object.entries(feeds).forEach(kvp => { @@ -115,6 +112,7 @@ export class OnnxruntimeWebAssemblySessionHandler implements InferenceSessionHan for (let i = 0; i < results.length; i++) { resultMap[this.outputNames[outputIndices[i]]] = outputArray[i] ?? decodeTensorMetadata(results[i]); } + TRACE_FUNC_END(); return resultMap; } diff --git a/js/web/lib/wasm/session-options.ts b/js/web/lib/wasm/session-options.ts index 45ea48a2df20..48eac5749472 100644 --- a/js/web/lib/wasm/session-options.ts +++ b/js/web/lib/wasm/session-options.ts @@ -60,9 +60,6 @@ const setExecutionProviders = // check EP name switch (epName) { - case 'xnnpack': - epName = 'XNNPACK'; - break; case 'webnn': epName = 'WEBNN'; if (typeof ep !== 'string') { @@ -171,6 +168,18 @@ export const setSessionOptions = (options?: InferenceSession.SessionOptions): [n setExecutionProviders(sessionOptionsHandle, sessionOptions.executionProviders, allocs); } + if (sessionOptions.enableGraphCapture !== undefined) { + if (typeof sessionOptions.enableGraphCapture !== 'boolean') { + throw new Error(`enableGraphCapture must be a boolean value: ${sessionOptions.enableGraphCapture}`); + } + const keyDataOffset = allocWasmString('enableGraphCapture', allocs); + const valueDataOffset = allocWasmString(sessionOptions.enableGraphCapture.toString(), allocs); + if (wasm._OrtAddSessionConfigEntry(sessionOptionsHandle, keyDataOffset, valueDataOffset) !== 0) { + checkLastError( + `Can't set a session config entry: 'enableGraphCapture' - ${sessionOptions.enableGraphCapture}.`); + } + } + if (sessionOptions.freeDimensionOverrides) { for (const [name, value] of Object.entries(sessionOptions.freeDimensionOverrides)) { if (typeof name !== 'string') { diff --git a/js/web/lib/wasm/wasm-common.ts b/js/web/lib/wasm/wasm-common.ts index b9eff45e890c..54eaf5e0c43c 100644 --- a/js/web/lib/wasm/wasm-common.ts +++ b/js/web/lib/wasm/wasm-common.ts @@ -3,6 +3,12 @@ import {Tensor} from 'onnxruntime-common'; +// a dummy type declaration for Float16Array in case any polyfill is available. 
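Note on the typed-array fallback below: when no Float16Array polyfill is present, 'float16' tensor data is surfaced through Uint16Array, i.e. as raw IEEE 754 binary16 bit patterns. A minimal sketch of decoding one such value on the JavaScript side (an illustrative helper, not part of this change):

```typescript
// Decode a single IEEE 754 binary16 value stored as a uint16 bit pattern.
// Illustrative helper only; onnxruntime-web itself leaves the bits untouched.
function float16BitsToNumber(bits: number): number {
  const sign = (bits & 0x8000) ? -1 : 1;
  const exponent = (bits >> 10) & 0x1f;
  const fraction = bits & 0x3ff;
  if (exponent === 0) {
    return sign * fraction * 2 ** -24;        // subnormal: fraction * 2^-10 * 2^-14
  }
  if (exponent === 0x1f) {
    return fraction ? NaN : sign * Infinity;  // Inf / NaN
  }
  return sign * (1 + fraction / 1024) * 2 ** (exponent - 15);
}

// e.g. 0x3c00 -> 1, 0xc000 -> -2, 0x7bff -> 65504 (largest finite half)
```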
+declare global { + // eslint-disable-next-line @typescript-eslint/naming-convention, @typescript-eslint/no-explicit-any + const Float16Array: any; +} + // This file includes common definitions. They do NOT have dependency on the WebAssembly instance. /** @@ -117,7 +123,8 @@ export const tensorTypeToTypedArrayConstructor = (type: Tensor.Type): Float32Arr Uint8ArrayConstructor|Float64ArrayConstructor|Uint32ArrayConstructor|BigUint64ArrayConstructor => { switch (type) { case 'float16': - return Uint16Array; + // allow Float16Array polyfill. + return typeof Float16Array !== 'undefined' && Float16Array.from ? Float16Array : Uint16Array; case 'float32': return Float32Array; case 'uint8': @@ -169,7 +176,8 @@ export const logLevelStringToEnum = (logLevel?: 'verbose'|'info'|'warning'|'erro * Check whether the given tensor type is supported by GPU buffer */ export const isGpuBufferSupportedType = (type: Tensor.Type): type is Tensor.GpuBufferDataTypes => type === 'float32' || - type === 'int32' || type === 'int64' || type === 'bool' || type === 'float16' || type === 'uint32'; + type === 'float16' || type === 'int32' || type === 'int64' || type === 'uint32' || type === 'uint8' || + type === 'bool'; /** * Map string data location to integer value diff --git a/js/web/lib/wasm/wasm-core-impl.ts b/js/web/lib/wasm/wasm-core-impl.ts index a9dfd9218bb6..9b27051f1b9f 100644 --- a/js/web/lib/wasm/wasm-core-impl.ts +++ b/js/web/lib/wasm/wasm-core-impl.ts @@ -9,6 +9,7 @@ import {setSessionOptions} from './session-options'; import {dataLocationStringToEnum, getTensorElementSize, isGpuBufferSupportedType, logLevelStringToEnum, tensorDataTypeEnumToString, tensorDataTypeStringToEnum, tensorTypeToTypedArrayConstructor} from './wasm-common'; import {getInstance} from './wasm-factory'; import {allocWasmString, checkLastError} from './wasm-utils'; +import {loadFile} from './wasm-utils-load-file'; // #region Initializations @@ -83,27 +84,57 @@ export const initRuntime = async(env: Env): Promise => { * @param epName */ export const initEp = async(env: Env, epName: string): Promise => { - if (!BUILD_DEFS.DISABLE_WEBGPU && epName === 'webgpu') { - // perform WebGPU availability check - if (typeof navigator === 'undefined' || !navigator.gpu) { - throw new Error('WebGPU is not supported in current environment'); - } - const adapter = await navigator.gpu.requestAdapter(); - if (!adapter) { - throw new Error( - 'Failed to get GPU adapter. You may need to enable flag "--enable-unsafe-webgpu" if you are using Chrome.'); - } + if (!BUILD_DEFS.DISABLE_WEBGPU) { + // eslint-disable-next-line @typescript-eslint/no-require-imports, @typescript-eslint/no-var-requires + const initJsep = require('./jsep/init').init; - if (!env.wasm.simd) { - throw new Error( - 'Not supported for WebGPU=ON and SIMD=OFF. Please set `env.wasm.simd` to true when using `webgpu` EP'); - } + if (epName === 'webgpu') { + // perform WebGPU availability check + if (typeof navigator === 'undefined' || !navigator.gpu) { + throw new Error('WebGPU is not supported in current environment'); + } - // init JSEP if available + let adapter = env.webgpu.adapter as GPUAdapter | null; + if (!adapter) { + // if adapter is not set, request a new adapter. 
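This hunk lets an application hand onnxruntime-web a pre-acquired GPUAdapter through env.webgpu.adapter instead of having initEp request one. A sketch of that usage in a browser module, assuming env.webgpu.adapter is writable before the first session is created (as the validation branch here implies; the model path is a placeholder):

```typescript
import * as ort from 'onnxruntime-web/webgpu';

// Acquire the adapter up front, e.g. to inspect its limits/features first,
// then hand it over before initEp runs for the first WebGPU session.
const adapter = await navigator.gpu.requestAdapter({powerPreference: 'high-performance'});
if (adapter) {
  ort.env.webgpu.adapter = adapter;
}
const session = await ort.InferenceSession.create('./model.onnx', {
  executionProviders: ['webgpu'],
});
```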
+ const powerPreference = env.webgpu.powerPreference; + if (powerPreference !== undefined && powerPreference !== 'low-power' && + powerPreference !== 'high-performance') { + throw new Error(`Invalid powerPreference setting: "${powerPreference}"`); + } + const forceFallbackAdapter = env.webgpu.forceFallbackAdapter; + if (forceFallbackAdapter !== undefined && typeof forceFallbackAdapter !== 'boolean') { + throw new Error(`Invalid forceFallbackAdapter setting: "${forceFallbackAdapter}"`); + } + adapter = await navigator.gpu.requestAdapter({powerPreference, forceFallbackAdapter}); + if (!adapter) { + throw new Error( + 'Failed to get GPU adapter. ' + + 'You may need to enable flag "--enable-unsafe-webgpu" if you are using Chrome.'); + } + } else { + // if adapter is set, validate it. + if (typeof adapter.limits !== 'object' || typeof adapter.features !== 'object' || + typeof adapter.requestDevice !== 'function') { + throw new Error('Invalid GPU adapter set in `env.webgpu.adapter`. It must be a GPUAdapter object.'); + } + } - // eslint-disable-next-line @typescript-eslint/no-require-imports, @typescript-eslint/no-var-requires - const initJsep = require('./jsep/init').init; - await initJsep(getInstance(), env, adapter); + if (!env.wasm.simd) { + throw new Error( + 'Not supported for WebGPU=ON and SIMD=OFF. Please set `env.wasm.simd` to true when using `webgpu` EP'); + } + + await initJsep('webgpu', getInstance(), env, adapter); + } + if (epName === 'webnn') { + // perform WebNN availability check + if (typeof navigator === 'undefined' || !(navigator as unknown as {ml: unknown}).ml) { + throw new Error('WebNN is not supported in current environment'); + } + + await initJsep('webnn', getInstance(), env); + } } }; @@ -138,7 +169,7 @@ type IOBindingState = { */ type SessionMetadata = [ inferenceSessionId: number, inputNamesUTF8Encoded: number[], outputNamesUTF8Encoded: number[], - bindingState: IOBindingState|null + bindingState: IOBindingState|null, enableGraphCapture: boolean, inputOutputBound: boolean ]; const activeSessions = new Map(); @@ -187,108 +218,136 @@ export const copyFromExternalBuffer = (model: Uint8Array): [number, number] => { * @param options an optional session options object. * @returns a 3-elements tuple containing [session handle, input names, output names] */ -export const createSession = - (modelData: Uint8Array|SerializableInternalBuffer, - options?: InferenceSession.SessionOptions): SerializableSessionMetadata => { - let modelDataOffset: number, modelDataLength: number; - const wasm = getInstance(); +export const createSession = async( + modelData: Uint8Array|SerializableInternalBuffer, + options?: InferenceSession.SessionOptions): Promise => { + let modelDataOffset: number, modelDataLength: number; + const wasm = getInstance(); - if (Array.isArray(modelData)) { - // if model data is an array, it must be a 2-elements tuple containing the pointer and size of the model data - [modelDataOffset, modelDataLength] = modelData; - } else if (modelData.buffer === wasm.HEAPU8.buffer) { - // if model data uses the same buffer as the WASM heap, we don't need to copy it. - [modelDataOffset, modelDataLength] = [modelData.byteOffset, modelData.byteLength]; - } else { - // otherwise, copy the model data to the WASM heap. 
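The createSession rewrite below also mounts external data files before the session is created: each externalData entry may be a bare path string (used as both mount path and fetch target) or a {path, data} descriptor, and each is resolved through loadFile. A hypothetical usage (URLs and file names are placeholders):

```typescript
import * as ort from 'onnxruntime-web/webgpu';

const session = await ort.InferenceSession.create('./model.onnx', {
  executionProviders: ['webgpu'],
  externalData: [
    // string form: the same value serves as mount path and fetch target
    './model.onnx.data',
    // descriptor form: mounted under `path`, bytes loaded from `data`
    {path: 'weights_1.bin', data: 'https://example.com/weights_1.bin'},
  ],
});
```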
- [modelDataOffset, modelDataLength] = copyFromExternalBuffer(modelData); + if (Array.isArray(modelData)) { + // if model data is an array, it must be a 2-elements tuple containing the pointer and size of the model data + [modelDataOffset, modelDataLength] = modelData; + } else if (modelData.buffer === wasm.HEAPU8.buffer) { + // if model data uses the same buffer as the WASM heap, we don't need to copy it. + [modelDataOffset, modelDataLength] = [modelData.byteOffset, modelData.byteLength]; + } else { + // otherwise, copy the model data to the WASM heap. + [modelDataOffset, modelDataLength] = copyFromExternalBuffer(modelData); + } + + let sessionHandle = 0; + let sessionOptionsHandle = 0; + let ioBindingHandle = 0; + let allocs: number[] = []; + const inputNamesUTF8Encoded = []; + const outputNamesUTF8Encoded = []; + + try { + [sessionOptionsHandle, allocs] = setSessionOptions(options); + + if (options?.externalData && wasm.mountExternalData) { + const loadingPromises = []; + for (const file of options.externalData) { + const path = typeof file === 'string' ? file : file.path; + loadingPromises.push(loadFile(typeof file === 'string' ? file : file.data).then(data => { + wasm.mountExternalData!(path, data); + })); } - let sessionHandle = 0; - let sessionOptionsHandle = 0; - let ioBindingHandle = 0; - let allocs: number[] = []; - const inputNamesUTF8Encoded = []; - const outputNamesUTF8Encoded = []; + // wait for all external data files to be loaded + await Promise.all(loadingPromises); + } - try { - [sessionOptionsHandle, allocs] = setSessionOptions(options); + sessionHandle = await wasm._OrtCreateSession(modelDataOffset, modelDataLength, sessionOptionsHandle); + if (sessionHandle === 0) { + checkLastError('Can\'t create a session.'); + } - sessionHandle = wasm._OrtCreateSession(modelDataOffset, modelDataLength, sessionOptionsHandle); - if (sessionHandle === 0) { - checkLastError('Can\'t create a session.'); - } + const [inputCount, outputCount] = getSessionInputOutputCount(sessionHandle); - const [inputCount, outputCount] = getSessionInputOutputCount(sessionHandle); + const enableGraphCapture = !!options?.enableGraphCapture; - const inputNames = []; - const outputNames = []; - const outputPreferredLocations: SupportedTensorDataLocationForInputOutput[] = []; - for (let i = 0; i < inputCount; i++) { - const name = wasm._OrtGetInputName(sessionHandle, i); - if (name === 0) { - checkLastError('Can\'t get an input name.'); - } - inputNamesUTF8Encoded.push(name); - inputNames.push(wasm.UTF8ToString(name)); + const inputNames = []; + const outputNames = []; + const outputPreferredLocations: SupportedTensorDataLocationForInputOutput[] = []; + for (let i = 0; i < inputCount; i++) { + const name = wasm._OrtGetInputName(sessionHandle, i); + if (name === 0) { + checkLastError('Can\'t get an input name.'); + } + inputNamesUTF8Encoded.push(name); + inputNames.push(wasm.UTF8ToString(name)); + } + for (let i = 0; i < outputCount; i++) { + const name = wasm._OrtGetOutputName(sessionHandle, i); + if (name === 0) { + checkLastError('Can\'t get an output name.'); + } + outputNamesUTF8Encoded.push(name); + const nameString = wasm.UTF8ToString(name); + outputNames.push(nameString); + + if (!BUILD_DEFS.DISABLE_WEBGPU) { + if (enableGraphCapture && options?.preferredOutputLocation === undefined) { + outputPreferredLocations.push('gpu-buffer'); + continue; } - for (let i = 0; i < outputCount; i++) { - const name = wasm._OrtGetOutputName(sessionHandle, i); - if (name === 0) { - checkLastError('Can\'t get an output 
name.'); - } - outputNamesUTF8Encoded.push(name); - const nameString = wasm.UTF8ToString(name); - outputNames.push(nameString); - - if (!BUILD_DEFS.DISABLE_WEBGPU) { - const location = typeof options?.preferredOutputLocation === 'string' ? - options.preferredOutputLocation : - options?.preferredOutputLocation?.[nameString] ?? 'cpu'; - if (location !== 'cpu' && location !== 'cpu-pinned' && location !== 'gpu-buffer') { - throw new Error(`Not supported preferred output location: ${location}.`); - } - outputPreferredLocations.push(location); - } + const location = typeof options?.preferredOutputLocation === 'string' ? + options.preferredOutputLocation : + options?.preferredOutputLocation?.[nameString] ?? 'cpu'; + if (location !== 'cpu' && location !== 'cpu-pinned' && location !== 'gpu-buffer') { + throw new Error(`Not supported preferred output location: ${location}.`); + } + if (enableGraphCapture && location !== 'gpu-buffer') { + throw new Error(`Not supported preferred output location: ${ + location}. Only 'gpu-buffer' location is supported when enableGraphCapture is true.`); } + outputPreferredLocations.push(location); + } + } - // use IO binding only when at least one output is preffered to be on GPU. - let bindingState: IOBindingState|null = null; - if (!BUILD_DEFS.DISABLE_WEBGPU && outputPreferredLocations.some(l => l === 'gpu-buffer')) { - ioBindingHandle = wasm._OrtCreateBinding(sessionHandle); - if (ioBindingHandle === 0) { - checkLastError('Can\'t create IO binding.'); - } + // use IO binding only when at least one output is preffered to be on GPU. + let bindingState: IOBindingState|null = null; + if (!BUILD_DEFS.DISABLE_WEBGPU && outputPreferredLocations.some(l => l === 'gpu-buffer')) { + ioBindingHandle = wasm._OrtCreateBinding(sessionHandle); + if (ioBindingHandle === 0) { + checkLastError('Can\'t create IO binding.'); + } - bindingState = { - handle: ioBindingHandle, - outputPreferredLocations, - outputPreferredLocationsEncoded: outputPreferredLocations.map(l => dataLocationStringToEnum(l)), - }; - } + bindingState = { + handle: ioBindingHandle, + outputPreferredLocations, + outputPreferredLocationsEncoded: outputPreferredLocations.map(l => dataLocationStringToEnum(l)), + }; + } - activeSessions.set(sessionHandle, [sessionHandle, inputNamesUTF8Encoded, outputNamesUTF8Encoded, bindingState]); - return [sessionHandle, inputNames, outputNames]; - } catch (e) { - inputNamesUTF8Encoded.forEach(buf => wasm._OrtFree(buf)); - outputNamesUTF8Encoded.forEach(buf => wasm._OrtFree(buf)); + activeSessions.set( + sessionHandle, + [sessionHandle, inputNamesUTF8Encoded, outputNamesUTF8Encoded, bindingState, enableGraphCapture, false]); + return [sessionHandle, inputNames, outputNames]; + } catch (e) { + inputNamesUTF8Encoded.forEach(buf => wasm._OrtFree(buf)); + outputNamesUTF8Encoded.forEach(buf => wasm._OrtFree(buf)); - if (ioBindingHandle !== 0) { - wasm._OrtReleaseBinding(ioBindingHandle); - } + if (ioBindingHandle !== 0) { + wasm._OrtReleaseBinding(ioBindingHandle); + } - if (sessionHandle !== 0) { - wasm._OrtReleaseSession(sessionHandle); - } - throw e; - } finally { - wasm._free(modelDataOffset); - if (sessionOptionsHandle !== 0) { - wasm._OrtReleaseSessionOptions(sessionOptionsHandle); - } - allocs.forEach(alloc => wasm._free(alloc)); - } - }; + if (sessionHandle !== 0) { + wasm._OrtReleaseSession(sessionHandle); + } + throw e; + } finally { + wasm._free(modelDataOffset); + if (sessionOptionsHandle !== 0) { + wasm._OrtReleaseSessionOptions(sessionOptionsHandle); + } + 
allocs.forEach(alloc => wasm._free(alloc)); + + // unmount external data if necessary + wasm.unmountExternalData?.(); + } +}; export const releaseSession = (sessionId: number): void => { const wasm = getInstance(); @@ -296,13 +355,16 @@ export const releaseSession = (sessionId: number): void => { if (!session) { throw new Error(`cannot release session. invalid session id: ${sessionId}`); } - const [sessionHandle, inputNamesUTF8Encoded, outputNamesUTF8Encoded, ioBindingState] = session; + const [sessionHandle, inputNamesUTF8Encoded, outputNamesUTF8Encoded, ioBindingState, enableGraphCapture] = session; if (ioBindingState) { + if (enableGraphCapture) { + wasm._OrtClearBoundOutputs(ioBindingState.handle); + } wasm._OrtReleaseBinding(ioBindingState.handle); } - wasm.jsepUnregisterBuffers?.(sessionId); + wasm.jsepOnReleaseSession?.(sessionId); inputNamesUTF8Encoded.forEach(buf => wasm._OrtFree(buf)); outputNamesUTF8Encoded.forEach(buf => wasm._OrtFree(buf)); @@ -311,70 +373,80 @@ export const releaseSession = (sessionId: number): void => { }; export const prepareInputOutputTensor = - (tensor: TensorMetadata|null, tensorHandles: number[], allocs: number[], sessionId: number, index: number): - void => { - if (!tensor) { - tensorHandles.push(0); - return; - } + (tensor: TensorMetadata|null, tensorHandles: number[], allocs: number[], sessionId: number, index: number, + enableGraphCapture = false): void => { + if (!tensor) { + tensorHandles.push(0); + return; + } - const wasm = getInstance(); + const wasm = getInstance(); - const dataType = tensor[0]; - const dims = tensor[1]; - const location = tensor[3]; + const dataType = tensor[0]; + const dims = tensor[1]; + const location = tensor[3]; - let rawData: number; - let dataByteLength: number; + let rawData: number; + let dataByteLength: number; - if (dataType === 'string' && location === 'gpu-buffer') { - throw new Error('String tensor is not supported on GPU.'); - } + if (dataType === 'string' && location === 'gpu-buffer') { + throw new Error('String tensor is not supported on GPU.'); + } - if (location === 'gpu-buffer') { - const gpuBuffer = tensor[2].gpuBuffer as GPUBuffer; - const elementSizeInBytes = getTensorElementSize(tensorDataTypeStringToEnum(dataType))!; - dataByteLength = dims.reduce((a, b) => a * b, 1) * elementSizeInBytes; - rawData = wasm.jsepRegisterBuffer(sessionId, index, gpuBuffer, dataByteLength); - } else { - const data = tensor[2]; - - if (Array.isArray(data)) { - // string tensor - dataByteLength = 4 * data.length; - rawData = wasm._malloc(dataByteLength); - allocs.push(rawData); - let dataIndex = rawData / 4; - for (let i = 0; i < data.length; i++) { - if (typeof data[i] !== 'string') { - throw new TypeError(`tensor data at index ${i} is not a string`); - } - wasm.HEAPU32[dataIndex++] = allocWasmString(data[i], allocs); - } - } else { - dataByteLength = data.byteLength; - rawData = wasm._malloc(dataByteLength); - allocs.push(rawData); - wasm.HEAPU8.set(new Uint8Array(data.buffer, data.byteOffset, dataByteLength), rawData); - } - } + if (enableGraphCapture && location !== 'gpu-buffer') { + throw new Error( + `External buffer must be provided for input/output index ${index} when enableGraphCapture is true.`); + } + + if (location === 'gpu-buffer') { + const gpuBuffer = tensor[2].gpuBuffer as GPUBuffer; + const elementSizeInBytes = getTensorElementSize(tensorDataTypeStringToEnum(dataType))!; + dataByteLength = dims.reduce((a, b) => a * b, 1) * elementSizeInBytes; - const stack = wasm.stackSave(); - const dimsOffset = 
wasm.stackAlloc(4 * dims.length); - try { - let dimIndex = dimsOffset / 4; - dims.forEach(d => wasm.HEAP32[dimIndex++] = d); - const tensor = wasm._OrtCreateTensor( - tensorDataTypeStringToEnum(dataType), rawData, dataByteLength, dimsOffset, dims.length, - dataLocationStringToEnum(location)); - if (tensor === 0) { - checkLastError(`Can't create tensor for input/output. session=${sessionId}, index=${index}.`); + const registerBuffer = wasm.jsepRegisterBuffer; + if (!registerBuffer) { + throw new Error('Tensor location "gpu-buffer" is not supported without using WebGPU.'); + } + rawData = registerBuffer(sessionId, index, gpuBuffer, dataByteLength); + } else { + const data = tensor[2]; + + if (Array.isArray(data)) { + // string tensor + dataByteLength = 4 * data.length; + rawData = wasm._malloc(dataByteLength); + allocs.push(rawData); + let dataIndex = rawData / 4; + for (let i = 0; i < data.length; i++) { + if (typeof data[i] !== 'string') { + throw new TypeError(`tensor data at index ${i} is not a string`); } - tensorHandles.push(tensor); - } finally { - wasm.stackRestore(stack); + wasm.HEAPU32[dataIndex++] = allocWasmString(data[i], allocs); } - }; + } else { + dataByteLength = data.byteLength; + rawData = wasm._malloc(dataByteLength); + allocs.push(rawData); + wasm.HEAPU8.set(new Uint8Array(data.buffer, data.byteOffset, dataByteLength), rawData); + } + } + + const stack = wasm.stackSave(); + const dimsOffset = wasm.stackAlloc(4 * dims.length); + try { + let dimIndex = dimsOffset / 4; + dims.forEach(d => wasm.HEAP32[dimIndex++] = d); + const tensor = wasm._OrtCreateTensor( + tensorDataTypeStringToEnum(dataType), rawData, dataByteLength, dimsOffset, dims.length, + dataLocationStringToEnum(location)); + if (tensor === 0) { + checkLastError(`Can't create tensor for input/output. session=${sessionId}, index=${index}.`); + } + tensorHandles.push(tensor); + } finally { + wasm.stackRestore(stack); + } + }; /** * perform inference run @@ -387,7 +459,12 @@ export const run = async( if (!session) { throw new Error(`cannot run inference. 
invalid session id: ${sessionId}`); } - const [sessionHandle, inputNamesUTF8Encoded, outputNamesUTF8Encoded, ioBindingState] = session; + const sessionHandle = session[0]; + const inputNamesUTF8Encoded = session[1]; + const outputNamesUTF8Encoded = session[2]; + const ioBindingState = session[3]; + const enableGraphCapture = session[4]; + const inputOutputBound = session[5]; const inputCount = inputIndices.length; const outputCount = outputIndices.length; @@ -410,13 +487,15 @@ export const run = async( // create input tensors for (let i = 0; i < inputCount; i++) { - prepareInputOutputTensor(inputTensors[i], inputTensorHandles, inputOutputAllocs, sessionId, inputIndices[i]); + prepareInputOutputTensor( + inputTensors[i], inputTensorHandles, inputOutputAllocs, sessionId, inputIndices[i], enableGraphCapture); } // create output tensors for (let i = 0; i < outputCount; i++) { prepareInputOutputTensor( - outputTensors[i], outputTensorHandles, inputOutputAllocs, sessionId, inputCount + outputIndices[i]); + outputTensors[i], outputTensorHandles, inputOutputAllocs, sessionId, inputCount + outputIndices[i], + enableGraphCapture); } let inputValuesIndex = inputValuesOffset / 4; @@ -432,7 +511,7 @@ export const run = async( wasm.HEAPU32[outputNamesIndex++] = outputNamesUTF8Encoded[outputIndices[i]]; } - if (!BUILD_DEFS.DISABLE_WEBGPU && ioBindingState) { + if (!BUILD_DEFS.DISABLE_WEBGPU && ioBindingState && !inputOutputBound) { const {handle, outputPreferredLocations, outputPreferredLocationsEncoded} = ioBindingState; if (inputNamesUTF8Encoded.length !== inputCount) { @@ -469,10 +548,13 @@ export const run = async( } } } + activeSessions.set( + sessionId, + [sessionHandle, inputNamesUTF8Encoded, outputNamesUTF8Encoded, ioBindingState, enableGraphCapture, true]); } + wasm.jsepOnRunStart?.(sessionHandle); let errorCode: number; - if (!BUILD_DEFS.DISABLE_WEBGPU && ioBindingState) { errorCode = await wasm._OrtRunWithBinding( sessionHandle, ioBindingState.handle, outputCount, outputValuesOffset, runOptionsHandle); @@ -540,7 +622,11 @@ export const run = async( // If a certain output's preferred location is GPU but the tensor is empty, we still need to create a CPU // tensor for it. There is no mapping GPU buffer for an empty tensor. 
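The download callback wired up below is what backs Tensor.getData() for GPU-resident outputs. A sketch of the caller-side flow, assuming preferredOutputLocation was set at session creation (the output name and feeds are placeholders, not taken from this diff):

```typescript
import * as ort from 'onnxruntime-web/webgpu';

declare const feeds: Record<string, ort.Tensor>;  // placeholder inputs

const session = await ort.InferenceSession.create('./model.onnx', {
  executionProviders: ['webgpu'],
  preferredOutputLocation: 'gpu-buffer',          // or per-output: {output0: 'gpu-buffer'}
});
const results = await session.run(feeds);
const out = results.output0;                      // hypothetical output name
console.log(out.location);                        // 'gpu-buffer'
const data = await out.getData();                 // exercises the download() created below
out.dispose();                                    // releases the GPU-side resources
```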
if (preferredLocation === 'gpu-buffer' && size > 0) { - const gpuBuffer = wasm.jsepGetBuffer(dataOffset); + const getBuffer = wasm.jsepGetBuffer; + if (!getBuffer) { + throw new Error('preferredLocation "gpu-buffer" is not supported without using WebGPU.'); + } + const gpuBuffer = getBuffer(dataOffset); const elementSize = getTensorElementSize(dataType); if (elementSize === undefined || !isGpuBufferSupportedType(type)) { throw new Error(`Unsupported data type: ${type}`); @@ -552,7 +638,7 @@ export const run = async( output.push([ type, dims, { gpuBuffer, - download: wasm.jsepCreateDownloader(gpuBuffer, size * elementSize, type), + download: wasm.jsepCreateDownloader!(gpuBuffer, size * elementSize, type), dispose: () => { wasm._OrtReleaseTensor(tensor); } @@ -578,10 +664,12 @@ export const run = async( } } - if (ioBindingState) { + if (ioBindingState && !enableGraphCapture) { wasm._OrtClearBoundOutputs(ioBindingState.handle); + activeSessions.set( + sessionId, + [sessionHandle, inputNamesUTF8Encoded, outputNamesUTF8Encoded, ioBindingState, enableGraphCapture, false]); } - return output; } finally { wasm.stackRestore(beforeRunStack); diff --git a/js/web/lib/wasm/wasm-factory.ts b/js/web/lib/wasm/wasm-factory.ts index 2b7d492cc70b..9b9334c93b78 100644 --- a/js/web/lib/wasm/wasm-factory.ts +++ b/js/web/lib/wasm/wasm-factory.ts @@ -28,13 +28,34 @@ let initialized = false; let initializing = false; let aborted = false; -const isMultiThreadSupported = (): boolean => { - try { - // If 'SharedArrayBuffer' is not available, WebAssembly threads will not work. - if (typeof SharedArrayBuffer === 'undefined') { - return false; +const isMultiThreadSupported = (numThreads: number): boolean => { + // WebAssembly threads are set to 1 (single thread). + if (numThreads === 1) { + return false; + } + + // If 'SharedArrayBuffer' is not available, WebAssembly threads will not work. + if (typeof SharedArrayBuffer === 'undefined') { + if (typeof self !== 'undefined' && !self.crossOriginIsolated) { + // eslint-disable-next-line no-console + console.warn( + 'env.wasm.numThreads is set to ' + numThreads + + ', but this will not work unless you enable crossOriginIsolated mode. ' + + 'See https://web.dev/cross-origin-isolation-guide/ for more info.'); } + return false; + } + + // onnxruntime-web does not support multi-threads in Node.js. + if (typeof process !== 'undefined' && process.versions && process.versions.node) { + // eslint-disable-next-line no-console + console.warn( + 'env.wasm.numThreads is set to ' + numThreads + + ', however, currently onnxruntime-web does not support multi-threads in Node.js. ' + + 'Please consider using onnxruntime-node for performance critical scenarios.'); + } + try { // Test for transferability of SABs (for browsers. 
needed for Firefox) // https://groups.google.com/forum/#!msg/mozilla.dev.platform/IHkBZlHETpA/dwsMNchWEQAJ if (typeof MessageChannel !== 'undefined') { @@ -106,7 +127,7 @@ export const initializeWebAssembly = async(flags: Env.WebAssemblyFlags): Promise const numThreads = flags.numThreads!; const simd = flags.simd!; - const useThreads = numThreads > 1 && isMultiThreadSupported(); + const useThreads = isMultiThreadSupported(numThreads); const useSimd = simd && isSimdSupported(); const wasmPaths = flags.wasmPaths; @@ -167,6 +188,7 @@ export const initializeWebAssembly = async(flags: Env.WebAssemblyFlags): Promise }; if (!BUILD_DEFS.DISABLE_WASM_THREAD && useThreads) { + config.numThreads = numThreads; if (typeof Blob === 'undefined') { config.mainScriptUrlOrBlob = path.join(__dirname, 'ort-wasm-threaded.js'); } else { diff --git a/js/web/lib/wasm/wasm-utils-load-file.ts b/js/web/lib/wasm/wasm-utils-load-file.ts new file mode 100644 index 000000000000..c6cdba2320bd --- /dev/null +++ b/js/web/lib/wasm/wasm-utils-load-file.ts @@ -0,0 +1,87 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +import * as fs from 'fs'; +import {readFile} from 'node:fs/promises'; + +/** + * Load a file into a Uint8Array. + * + * @param file - the file to load. Can be a URL/path, a Blob, an ArrayBuffer, or a Uint8Array. + * @returns a Uint8Array containing the file data. + */ +export const loadFile = async(file: string|Blob|ArrayBufferLike|Uint8Array): Promise => { + if (typeof file === 'string') { + if (typeof process !== 'undefined' && process.versions && process.versions.node) { + // load file into ArrayBuffer in Node.js + try { + return new Uint8Array(await readFile(file)); + } catch (e) { + if (e.code === 'ERR_FS_FILE_TOO_LARGE') { + // file is too large, use fs.createReadStream instead + const stream = fs.createReadStream(file); + const chunks: Uint8Array[] = []; + for await (const chunk of stream) { + chunks.push(chunk); + } + return new Uint8Array(Buffer.concat(chunks)); + } + throw e; + } + } else { + // load file into ArrayBuffer in browsers + const response = await fetch(file); + if (!response.ok) { + throw new Error(`failed to load external data file: ${file}`); + } + const contentLengthHeader = response.headers.get('Content-Length'); + const fileSize = contentLengthHeader ? parseInt(contentLengthHeader, 10) : 0; + if (fileSize < 1073741824 /* 1GB */) { + // when Content-Length header is not set, we cannot determine the file size. We assume it is small enough to + // load into memory. 
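Two details of the branch below are worth spelling out. First, when the Content-Length header is absent, fileSize parses to 0 and deliberately falls into the in-memory branch. Second, for the large-file path, a plain ArrayBuffer is tried first and, on RangeError, the buffer is taken from a WebAssembly.Memory, which allocates in 64 KiB pages. The page arithmetic, worked for a hypothetical 3 GiB payload:

```typescript
// WebAssembly memory is sized in 64 KiB (65536-byte) pages.
const fileSize = 3 * 1024 * 1024 * 1024;            // hypothetical 3 GiB file
const pages = Math.ceil(fileSize / 65536);          // 49152 pages
const memory = new WebAssembly.Memory({initial: pages, maximum: pages});
console.log(memory.buffer.byteLength === fileSize); // true: 49152 * 65536 === 3221225472
```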
+ return new Uint8Array(await response.arrayBuffer()); + } else { + // file is too large, use stream instead + if (!response.body) { + throw new Error(`failed to load external data file: ${file}, no response body.`); + } + const reader = response.body.getReader(); + + let buffer; + try { + // try to create ArrayBuffer directly + buffer = new ArrayBuffer(fileSize); + } catch (e) { + if (e instanceof RangeError) { + // use WebAssembly Memory to allocate larger ArrayBuffer + const pages = Math.ceil(fileSize / 65536); + buffer = new WebAssembly.Memory({initial: pages, maximum: pages}).buffer; + } else { + throw e; + } + } + + let offset = 0; + // eslint-disable-next-line no-constant-condition + while (true) { + const {done, value} = await reader.read(); + if (done) { + break; + } + const chunkSize = value.byteLength; + const chunk = new Uint8Array(buffer, offset, chunkSize); + chunk.set(value); + offset += chunkSize; + } + return new Uint8Array(buffer, 0, fileSize); + } + } + + } else if (file instanceof Blob) { + return new Uint8Array(await file.arrayBuffer()); + } else if (file instanceof Uint8Array) { + return file; + } else { + return new Uint8Array(file); + } +}; diff --git a/js/web/package-lock.json b/js/web/package-lock.json index 890c5a0f3476..72fe383f04fe 100644 --- a/js/web/package-lock.json +++ b/js/web/package-lock.json @@ -1,12 +1,12 @@ { "name": "onnxruntime-web", - "version": "1.17.0", + "version": "1.18.0", "lockfileVersion": 2, "requires": true, "packages": { "": { "name": "onnxruntime-web", - "version": "1.17.0", + "version": "1.18.0", "license": "MIT", "dependencies": { "flatbuffers": "^1.12.0", @@ -28,7 +28,7 @@ "@webgpu/types": "^0.1.38", "base64-js": "^1.5.1", "chai": "^4.3.7", - "electron": "^23.1.2", + "electron": "^28.1.4", "globby": "^13.1.3", "karma": "^6.4.1", "karma-browserstack-launcher": "^1.6.0", @@ -49,10 +49,10 @@ }, "../common": { "name": "onnxruntime-common", - "version": "1.17.0", + "version": "1.18.0", "license": "MIT", "devDependencies": { - "typedoc": "^0.23.22" + "typedoc": "^0.25.7" } }, "node_modules/@chiragrupani/karma-chromium-edge-launcher": { @@ -862,9 +862,9 @@ } }, "node_modules/cross-spawn/node_modules/semver": { - "version": "5.7.1", - "resolved": "https://registry.npmjs.org/semver/-/semver-5.7.1.tgz", - "integrity": "sha512-sauaDf/PZdVgrLTNYHRtpXa1iRiKcaebiKQ1BJdpQlWH2lCvexQdX55snPFyK7QzpudqbCI0qXFfOasHdyNDGQ==", + "version": "5.7.2", + "resolved": "https://registry.npmjs.org/semver/-/semver-5.7.2.tgz", + "integrity": "sha512-cBznnQ9KjJqU67B52RMC65CMarK2600WFnbkcaiwWq3xy/5haFJlshgnpjovMVJ+Hff49d8GEn0b87C5pDQ10g==", "dev": true, "bin": { "semver": "bin/semver" @@ -1042,14 +1042,14 @@ "dev": true }, "node_modules/electron": { - "version": "23.3.13", - "resolved": "https://registry.npmjs.org/electron/-/electron-23.3.13.tgz", - "integrity": "sha512-BaXtHEb+KYKLouUXlUVDa/lj9pj4F5kiE0kwFdJV84Y2EU7euIDgPthfKtchhr5MVHmjtavRMIV/zAwEiSQ9rQ==", + "version": "28.1.4", + "resolved": "https://registry.npmjs.org/electron/-/electron-28.1.4.tgz", + "integrity": "sha512-WE6go611KOhtH6efRPMnVC7FE7DCKnQ3ZyHFeI1DbaCy8OU4UjZ8/CZGcuZmZgRdxSBEHoHdgaJkWRHZzF0FOg==", "dev": true, "hasInstallScript": true, "dependencies": { "@electron/get": "^2.0.0", - "@types/node": "^16.11.26", + "@types/node": "^18.11.18", "extract-zip": "^2.0.1" }, "bin": { @@ -1059,12 +1059,6 @@ "node": ">= 12.20.55" } }, - "node_modules/electron/node_modules/@types/node": { - "version": "16.18.14", - "resolved": "https://registry.npmjs.org/@types/node/-/node-16.18.14.tgz", - "integrity": 
"sha512-wvzClDGQXOCVNU4APPopC2KtMYukaF1MN/W3xAmslx22Z4/IF1/izDMekuyoUlwfnDHYCIZGaj7jMwnJKBTxKw==", - "dev": true - }, "node_modules/emoji-regex": { "version": "8.0.0", "resolved": "https://registry.npmjs.org/emoji-regex/-/emoji-regex-8.0.0.tgz", @@ -1357,9 +1351,9 @@ "dev": true }, "node_modules/follow-redirects": { - "version": "1.15.2", - "resolved": "https://registry.npmjs.org/follow-redirects/-/follow-redirects-1.15.2.tgz", - "integrity": "sha512-VQLG33o04KaQ8uYi2tVNbdrWp1QWxNNea+nmIB4EVM28v0hmP17z7aG1+wAkNzVq4KeXTq3221ye5qTJP91JwA==", + "version": "1.15.6", + "resolved": "https://registry.npmjs.org/follow-redirects/-/follow-redirects-1.15.6.tgz", + "integrity": "sha512-wWN62YITEaOpSK584EZXJafH1AGpO8RVgElfkuXbTOrPX4fIfOyEpW/CsiNd8JdYrAoOvafRTOEnvsO++qCqFA==", "dev": true, "funding": [ { @@ -1432,9 +1426,9 @@ } }, "node_modules/get-func-name": { - "version": "2.0.0", - "resolved": "https://registry.npmjs.org/get-func-name/-/get-func-name-2.0.0.tgz", - "integrity": "sha512-Hm0ixYtaSZ/V7C8FJrtZIuBBI+iSgL+1Aq82zSu8VQNB4S3Gk8e7Qs3VwBDJAhmRZcFqkl3tQu36g/Foh5I5ig==", + "version": "2.0.2", + "resolved": "https://registry.npmjs.org/get-func-name/-/get-func-name-2.0.2.tgz", + "integrity": "sha512-8vXOvuE167CtIc3OyItco7N/dpRtBbYOsPsXCz7X/PMnlGjYjSGuZJgM1Y7mmew7BKf9BqvLX2tnOVy1BBUsxQ==", "dev": true, "engines": { "node": "*" @@ -1542,9 +1536,9 @@ } }, "node_modules/global-agent/node_modules/semver": { - "version": "7.3.8", - "resolved": "https://registry.npmjs.org/semver/-/semver-7.3.8.tgz", - "integrity": "sha512-NB1ctGL5rlHrPJtFDVIVzTyQylMLu9N9VICA6HSFJo8MCGVTMW6gfpicwKmmK/dAjTOrqu5l63JJOpDSrAis3A==", + "version": "7.5.4", + "resolved": "https://registry.npmjs.org/semver/-/semver-7.5.4.tgz", + "integrity": "sha512-1bCSESV6Pv+i21Hvpxp3Dx+pSD8lIPt8uVjRrxAUt/nbswYc+tK6Y2btiULjd4+fnq15PX+nqQDC7Oft7WkwcA==", "dev": true, "optional": true, "dependencies": { @@ -2635,9 +2629,9 @@ } }, "node_modules/protobufjs": { - "version": "7.2.4", - "resolved": "https://registry.npmjs.org/protobufjs/-/protobufjs-7.2.4.tgz", - "integrity": "sha512-AT+RJgD2sH8phPmCf7OUZR8xGdcJRga4+1cOaXJ64hvcSkVhNcRHOwIxUatPH15+nj59WAGTDv3LSGZPEQbJaQ==", + "version": "7.2.5", + "resolved": "https://registry.npmjs.org/protobufjs/-/protobufjs-7.2.5.tgz", + "integrity": "sha512-gGXRSXvxQ7UiPgfw8gevrfRWcTlSbOFg+p/N+JVJEK5VhueL2miT6qTymqAmjr1Q5WbOCyJbyrk6JfWKwlFn6A==", "hasInstallScript": true, "dependencies": { "@protobufjs/aspromise": "^1.1.2", @@ -2908,9 +2902,9 @@ "dev": true }, "node_modules/semver": { - "version": "6.3.0", - "resolved": "https://registry.npmjs.org/semver/-/semver-6.3.0.tgz", - "integrity": "sha512-b39TBaTSfV6yBrapU89p5fKekE2m/NwnDocOVruQFS1/veMgdzuPcnOM34M6CwxW8jH/lxEa5rBoDeUwu5HHTw==", + "version": "6.3.1", + "resolved": "https://registry.npmjs.org/semver/-/semver-6.3.1.tgz", + "integrity": "sha512-BR7VvDCVHO+q2xBEWskxS6DJE1qRnb7DxzUrogb71CWoSficBxYsiAGd+Kl0mmq/MprG9yArRkyrQxTO6XjMzA==", "dev": true, "bin": { "semver": "bin/semver.js" @@ -4203,9 +4197,9 @@ }, "dependencies": { "semver": { - "version": "5.7.1", - "resolved": "https://registry.npmjs.org/semver/-/semver-5.7.1.tgz", - "integrity": "sha512-sauaDf/PZdVgrLTNYHRtpXa1iRiKcaebiKQ1BJdpQlWH2lCvexQdX55snPFyK7QzpudqbCI0qXFfOasHdyNDGQ==", + "version": "5.7.2", + "resolved": "https://registry.npmjs.org/semver/-/semver-5.7.2.tgz", + "integrity": "sha512-cBznnQ9KjJqU67B52RMC65CMarK2600WFnbkcaiwWq3xy/5haFJlshgnpjovMVJ+Hff49d8GEn0b87C5pDQ10g==", "dev": true } } @@ -4339,22 +4333,14 @@ "dev": true }, "electron": { - "version": "23.3.13", - "resolved": 
"https://registry.npmjs.org/electron/-/electron-23.3.13.tgz", - "integrity": "sha512-BaXtHEb+KYKLouUXlUVDa/lj9pj4F5kiE0kwFdJV84Y2EU7euIDgPthfKtchhr5MVHmjtavRMIV/zAwEiSQ9rQ==", + "version": "28.1.4", + "resolved": "https://registry.npmjs.org/electron/-/electron-28.1.4.tgz", + "integrity": "sha512-WE6go611KOhtH6efRPMnVC7FE7DCKnQ3ZyHFeI1DbaCy8OU4UjZ8/CZGcuZmZgRdxSBEHoHdgaJkWRHZzF0FOg==", "dev": true, "requires": { "@electron/get": "^2.0.0", - "@types/node": "^16.11.26", + "@types/node": "^18.11.18", "extract-zip": "^2.0.1" - }, - "dependencies": { - "@types/node": { - "version": "16.18.14", - "resolved": "https://registry.npmjs.org/@types/node/-/node-16.18.14.tgz", - "integrity": "sha512-wvzClDGQXOCVNU4APPopC2KtMYukaF1MN/W3xAmslx22Z4/IF1/izDMekuyoUlwfnDHYCIZGaj7jMwnJKBTxKw==", - "dev": true - } } }, "emoji-regex": { @@ -4609,9 +4595,9 @@ "dev": true }, "follow-redirects": { - "version": "1.15.2", - "resolved": "https://registry.npmjs.org/follow-redirects/-/follow-redirects-1.15.2.tgz", - "integrity": "sha512-VQLG33o04KaQ8uYi2tVNbdrWp1QWxNNea+nmIB4EVM28v0hmP17z7aG1+wAkNzVq4KeXTq3221ye5qTJP91JwA==", + "version": "1.15.6", + "resolved": "https://registry.npmjs.org/follow-redirects/-/follow-redirects-1.15.6.tgz", + "integrity": "sha512-wWN62YITEaOpSK584EZXJafH1AGpO8RVgElfkuXbTOrPX4fIfOyEpW/CsiNd8JdYrAoOvafRTOEnvsO++qCqFA==", "dev": true }, "from": { @@ -4657,9 +4643,9 @@ "dev": true }, "get-func-name": { - "version": "2.0.0", - "resolved": "https://registry.npmjs.org/get-func-name/-/get-func-name-2.0.0.tgz", - "integrity": "sha512-Hm0ixYtaSZ/V7C8FJrtZIuBBI+iSgL+1Aq82zSu8VQNB4S3Gk8e7Qs3VwBDJAhmRZcFqkl3tQu36g/Foh5I5ig==", + "version": "2.0.2", + "resolved": "https://registry.npmjs.org/get-func-name/-/get-func-name-2.0.2.tgz", + "integrity": "sha512-8vXOvuE167CtIc3OyItco7N/dpRtBbYOsPsXCz7X/PMnlGjYjSGuZJgM1Y7mmew7BKf9BqvLX2tnOVy1BBUsxQ==", "dev": true }, "get-intrinsic": { @@ -4742,9 +4728,9 @@ }, "dependencies": { "semver": { - "version": "7.3.8", - "resolved": "https://registry.npmjs.org/semver/-/semver-7.3.8.tgz", - "integrity": "sha512-NB1ctGL5rlHrPJtFDVIVzTyQylMLu9N9VICA6HSFJo8MCGVTMW6gfpicwKmmK/dAjTOrqu5l63JJOpDSrAis3A==", + "version": "7.5.4", + "resolved": "https://registry.npmjs.org/semver/-/semver-7.5.4.tgz", + "integrity": "sha512-1bCSESV6Pv+i21Hvpxp3Dx+pSD8lIPt8uVjRrxAUt/nbswYc+tK6Y2btiULjd4+fnq15PX+nqQDC7Oft7WkwcA==", "dev": true, "optional": true, "requires": { @@ -5517,7 +5503,7 @@ "onnxruntime-common": { "version": "file:../common", "requires": { - "typedoc": "^0.23.22" + "typedoc": "^0.25.7" } }, "p-cancelable": { @@ -5595,9 +5581,9 @@ "dev": true }, "protobufjs": { - "version": "7.2.4", - "resolved": "https://registry.npmjs.org/protobufjs/-/protobufjs-7.2.4.tgz", - "integrity": "sha512-AT+RJgD2sH8phPmCf7OUZR8xGdcJRga4+1cOaXJ64hvcSkVhNcRHOwIxUatPH15+nj59WAGTDv3LSGZPEQbJaQ==", + "version": "7.2.5", + "resolved": "https://registry.npmjs.org/protobufjs/-/protobufjs-7.2.5.tgz", + "integrity": "sha512-gGXRSXvxQ7UiPgfw8gevrfRWcTlSbOFg+p/N+JVJEK5VhueL2miT6qTymqAmjr1Q5WbOCyJbyrk6JfWKwlFn6A==", "requires": { "@protobufjs/aspromise": "^1.1.2", "@protobufjs/base64": "^1.1.2", @@ -5780,9 +5766,9 @@ "dev": true }, "semver": { - "version": "6.3.0", - "resolved": "https://registry.npmjs.org/semver/-/semver-6.3.0.tgz", - "integrity": "sha512-b39TBaTSfV6yBrapU89p5fKekE2m/NwnDocOVruQFS1/veMgdzuPcnOM34M6CwxW8jH/lxEa5rBoDeUwu5HHTw==", + "version": "6.3.1", + "resolved": "https://registry.npmjs.org/semver/-/semver-6.3.1.tgz", + "integrity": 
"sha512-BR7VvDCVHO+q2xBEWskxS6DJE1qRnb7DxzUrogb71CWoSficBxYsiAGd+Kl0mmq/MprG9yArRkyrQxTO6XjMzA==", "dev": true }, "semver-compare": { diff --git a/js/web/package.json b/js/web/package.json index 9b4531d7766f..384565dc0da9 100644 --- a/js/web/package.json +++ b/js/web/package.json @@ -1,6 +1,5 @@ { "license": "MIT", - "browser": "dist/ort-web.min.js", "unpkg": "dist/ort.min.js", "name": "onnxruntime-web", "repository": { @@ -8,7 +7,7 @@ "type": "git" }, "author": "fs-eire", - "version": "1.17.0", + "version": "1.18.0", "jsdelivr": "dist/ort.min.js", "dependencies": { "flatbuffers": "^1.12.0", @@ -24,6 +23,7 @@ "build:doc": "node ./script/generate-webgl-operator-md && node ./script/generate-webgpu-operator-md", "pull:wasm": "node ./script/pull-prebuilt-wasm-artifacts", "test:e2e": "node ./test/e2e/run", + "test:training:e2e": "node ./test/training/e2e/run", "prebuild": "tsc -p . --noEmit && tsc -p lib/wasm/proxy-worker --noEmit", "build": "node ./script/build", "test": "tsc --build ../scripts && node ../scripts/prepare-onnx-node-tests && node ./script/test-runner-cli", @@ -46,7 +46,7 @@ "@webgpu/types": "^0.1.38", "base64-js": "^1.5.1", "chai": "^4.3.7", - "electron": "^23.1.2", + "electron": "^28.1.4", "globby": "^13.1.3", "karma": "^6.4.1", "karma-browserstack-launcher": "^1.6.0", @@ -68,11 +68,14 @@ "exports": { ".": { "node": "./dist/ort.node.min.js", + "types": "./types.d.ts", "default": { "import": "./dist/esm/ort.min.js", "require": "./dist/cjs/ort.min.js", + "types": "./types.d.ts", "default": { "development": "./dist/ort.js", + "types": "./types.d.ts", "default": "./dist/ort.min.js" } } @@ -80,34 +83,41 @@ "./experimental": { "import": "./dist/esm/ort.all.min.js", "require": "./dist/cjs/ort.all.min.js", + "types": "./types.d.ts", "default": { "development": "./dist/ort.all.js", + "types": "./types.d.ts", "default": "./dist/ort.all.min.js" } }, "./wasm": { "import": "./dist/esm/ort.wasm.min.js", "require": "./dist/cjs/ort.wasm.min.js", + "types": "./types.d.ts", "default": "./dist/ort.wasm.min.js" }, "./wasm-core": { "import": "./dist/esm/ort.wasm-core.min.js", "require": "./dist/cjs/ort.wasm-core.min.js", + "types": "./types.d.ts", "default": "./dist/ort.wasm-core.min.js" }, "./webgl": { "import": "./dist/esm/ort.webgl.min.js", "require": "./dist/cjs/ort.webgl.min.js", + "types": "./types.d.ts", "default": "./dist/ort.webgl.min.js" }, "./webgpu": { "import": "./dist/esm/ort.webgpu.min.js", "require": "./dist/cjs/ort.webgpu.min.js", + "types": "./types.d.ts", "default": "./dist/ort.webgpu.min.js" }, "./training": { "import": "./dist/esm/ort.training.wasm.min.js", "require": "./dist/cjs/ort.training.wasm.min.js", + "types": "./types.d.ts", "default": "./dist/ort.training.wasm.min.js" } }, diff --git a/js/web/script/build.ts b/js/web/script/build.ts index 5151f27582c1..d3652f382035 100644 --- a/js/web/script/build.ts +++ b/js/web/script/build.ts @@ -121,7 +121,11 @@ async function buildOrt({ case 'node:fs/promises': case 'node:fs': case 'fs': - return {contents: 'export const readFile = undefined;'}; + return { + contents: 'export const readFile = undefined;' + + 'export const readFileSync = undefined;' + + 'export const createReadStream = undefined;' + }; case 'node:os': case 'os': return {contents: 'export const cpus = undefined;'}; @@ -367,10 +371,7 @@ async function main() { if (BUNDLE_MODE === 'dev') { // ort.all.js - await addBuildTask(buildOrt({ - outputBundleName: 'ort.all', - format: 'iife', - })); + await addBuildTask(buildOrt({outputBundleName: 'ort.all', format: 'iife', 
define: {...DEFAULT_DEFINE}})); } if (BUNDLE_MODE === 'perf') { @@ -404,7 +405,11 @@ async function main() { // ort.webgl[.min].js await addAllWebBuildTasks({ outputBundleName: 'ort.webgl', - define: {...DEFAULT_DEFINE, 'BUILD_DEFS.DISABLE_WEBGPU': 'true', 'BUILD_DEFS.DISABLE_WASM': 'true'}, + define: { + ...DEFAULT_DEFINE, + 'BUILD_DEFS.DISABLE_WEBGPU': 'true', + 'BUILD_DEFS.DISABLE_WASM': 'true', + }, }); // ort.wasm-core[.min].js await addAllWebBuildTasks({ diff --git a/js/web/script/test-runner-cli-args.ts b/js/web/script/test-runner-cli-args.ts index ee955ec8d4f1..adcd940178e0 100644 --- a/js/web/script/test-runner-cli-args.ts +++ b/js/web/script/test-runner-cli-args.ts @@ -29,14 +29,15 @@ Options: *** General Options *** -h, --help Print this message. - -d, --debug Specify to run test runner in debug mode. - Debug mode outputs verbose log for test runner, sets up environment debug flag, and keeps karma not to exit after tests completed. + -d, --debug Specify to run test runner in debug mode. Debug mode does the following: + - outputs verbose log for test runner + - sets up environment debug flag (env.debug = true) + - opens Chromium debug port at 9333 and keeps karma from exiting after the tests complete. -b=<...>, --backend=<...> Specify one or more backend(s) to run the test upon. Backends can be one or more of the following, splitted by comma: webgl webgpu wasm - xnnpack webnn -e=<...>, --env=<...> Specify the environment to run the test. Should be one of the following: chrome (default) edge (Windows only) firefox electron safari (MacOS only) node bs (for BrowserStack tests) -p, --profile Enable profiler. Profiler will generate extra logs which include the information of events time consumption + -t, --trace Enable tracing. -P[=<...>], --perf[=<...>] Generate performance number. Cannot be used with flag --debug. This flag can be used with a number as value, specifying the total count of test cases to run. The test cases may be used multiple times. Default value is 10. -c, --file-cache Enable file cache. + +*** Session Options *** + -u=<...>, --optimized-model-file-path=<...> Specify whether to dump the optimized model. + -o=<...>, --graph-optimization-level=<...> Specify graph optimization level. + Default is 'all'. Valid values are 'disabled', 'basic', 'extended', 'all'. -i=<...>, --io-binding=<...> Specify the IO binding testing type. Should be one of the following: - none (default) + none (default) gpu-tensor use pre-allocated GPU tensors for inputs and outputs gpu-location use pre-allocated GPU tensors for inputs and set preferredOutputLocation to 'gpu-buffer' -*** Session Options *** - -u=<...>, --optimized-model-file-path=<...> Specify whether to dump the optimized model. - -o=<...>, --graph-optimization-level=<...> Specify graph optimization level. - Default is 'all'. Valid values are 'disabled', 'basic', 'extended', 'all'. *** Logging Options *** - --log-verbose=<...> Set log level to verbose - --log-info=<...> Set log level to info - --log-warning=<...> Set log level to warning - --log-error=<...> Set log level to error - The 4 flags above specify the logging configuration. Each flag allows to specify one or more category(s), splitted by comma. If use the flags without value, the log level will be applied to all category. + --log-verbose Set log level to verbose + --log-info Set log level to info + --log-warning Set log level to warning + --log-error Set log level to error + The 4 flags above specify the logging configuration.
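The *** Backend Options *** section that follows, together with the rewritten parseWasmFlags/parseWebglFlags/parseWebgpuFlags later in this diff, applies one pattern: the new dotted flag (e.g. --wasm.numThreads) takes precedence, the deprecated long-form flag (e.g. --wasm-number-threads) remains as a fallback, and string booleans are normalized before validation. A minimal standalone sketch of that pattern, assuming minimist-style parsed args; toBoolean is a hypothetical stand-in for the parseBooleanArg helper the real code uses:

import minimist from 'minimist';

// Hypothetical stand-in for parseBooleanArg: normalize the strings
// 'true'/'false' to booleans and pass everything else through so the
// caller can reject invalid values.
function toBoolean(value: unknown): unknown {
  return value === 'true' ? true : value === 'false' ? false : value;
}

function parseWasmFlagsSketch(args: minimist.ParsedArgs) {
  const wasm = args.wasm || {};
  // The dotted flag wins; the deprecated flags are fallbacks.
  const numThreads = wasm.numThreads ?? (args.x ?? args['wasm-number-threads']);
  if (typeof numThreads !== 'undefined' && typeof numThreads !== 'number') {
    throw new Error('Flag "wasm.numThreads"/"x"/"wasm-number-threads" must be a number value');
  }
  const simd = toBoolean(wasm.simd ?? args['wasm-enable-simd']);
  if (typeof simd !== 'undefined' && typeof simd !== 'boolean') {
    throw new Error('Flag "wasm.simd"/"wasm-enable-simd" must be a boolean value');
  }
  return {numThreads, simd};
}

// minimist expands "--wasm.numThreads=4" into {wasm: {numThreads: 4}},
// which is why the dotted form can be read off args.wasm directly.
console.log(parseWasmFlagsSketch(minimist(['--wasm.numThreads=4', '--wasm-enable-simd=false'])));
// -> { numThreads: 4, simd: false }

The same precedence rule is what each "(deprecated. use ...)" note in the help text below describes.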
*** Backend Options *** + --wasm.<...>=<...> Set global environment flags for each backend. + --webgl.<...>=<...> These flags can be used multiple times to set multiple flags. For example: + --webgpu.<...>=<...> --webgpu.profiling.mode=default --wasm.numThreads=1 --wasm.simd=false + --webnn.<...>=<...> + + --webnn-device-type Set the WebNN device type (cpu/gpu/npu) + -x, --wasm-number-threads Set the WebAssembly number of threads + ("--wasm-number-threads" is deprecated. use "--wasm.numThreads" or "-x" instead) --wasm-init-timeout Set the timeout for WebAssembly backend initialization, in milliseconds + (deprecated. use "--wasm.initTimeout" instead) --wasm-enable-simd Set whether to enable SIMD + (deprecated. use "--wasm.simd" instead) --wasm-enable-proxy Set whether to enable proxy worker + (deprecated. use "--wasm.proxy" instead) --webgl-context-id Set the WebGL context ID (webgl/webgl2) + (deprecated. use "--webgl.contextId" instead) --webgl-matmul-max-batch-size Set the WebGL matmulMaxBatchSize + (deprecated. use "--webgl.matmulMaxBatchSize" instead) --webgl-texture-cache-mode Set the WebGL texture cache mode (initializerOnly/full) + (deprecated. use "--webgl.textureCacheMode" instead) --webgl-texture-pack-mode Set the WebGL texture pack mode (true/false) + (deprecated. use "--webgl.pack" instead) --webgpu-profiling-mode Set the WebGPU profiling mode (off/default) + (deprecated. use "--webgpu.profiling.mode" instead) *** Browser Options *** --no-sandbox This flag will be passed to Chrome. Sometimes Chrome need this flag to work together with Karma. + --user-data-dir=<...> This flag will be passed to browsers to specify the user data directory. --chromium-flags=<...> This flag will be passed to Chrome and Edge browsers. Can be used multiple times. Examples: @@ -110,7 +130,7 @@ Examples: export declare namespace TestRunnerCliArgs { type Mode = 'suite0'|'suite1'|'model'|'unittest'|'op'; - type Backend = 'cpu'|'webgl'|'webgpu'|'wasm'|'onnxruntime'|'xnnpack'|'webnn'; + type Backend = 'cpu'|'webgl'|'webgpu'|'wasm'|'onnxruntime'|'webnn'; type Environment = 'chrome'|'edge'|'firefox'|'electron'|'safari'|'node'|'bs'; type BundleMode = 'dev'|'perf'; type IOBindingMode = 'none'|'gpu-tensor'|'gpu-location'; @@ -171,11 +191,12 @@ export interface TestRunnerCliArgs { cpuOptions?: InferenceSession.CpuExecutionProviderOption; cudaOptions?: InferenceSession.CudaExecutionProviderOption; - cudaFlags?: Record; wasmOptions?: InferenceSession.WebAssemblyExecutionProviderOption; webglOptions?: InferenceSession.WebGLExecutionProviderOption; + webnnOptions?: InferenceSession.WebNNExecutionProviderOption; globalEnvFlags?: Test.Options['globalEnvFlags']; noSandbox?: boolean; + userDataDir?: string; chromiumFlags: string[]; } @@ -259,40 +280,29 @@ function parseCpuOptions(_args: minimist.ParsedArgs): InferenceSession.CpuExecut return {name: 'cpu'}; } -function parseCpuFlags(_args: minimist.ParsedArgs): Record { - return {}; -} - function parseWasmOptions(_args: minimist.ParsedArgs): InferenceSession.WebAssemblyExecutionProviderOption { return {name: 'wasm'}; } function parseWasmFlags(args: minimist.ParsedArgs): Env.WebAssemblyFlags { - const numThreads = args.x || args['wasm-number-threads']; + const wasm = args.wasm || {}; + const numThreads = wasm.numThreads = wasm.numThreads ?? (args.x ?? 
args['wasm-number-threads']); if (typeof numThreads !== 'undefined' && typeof numThreads !== 'number') { - throw new Error('Flag "x"/"wasm-number-threads" must be a number value'); + throw new Error('Flag "wasm.numThreads"/"x"/"wasm-number-threads" must be a number value'); } - const initTimeout = args['wasm-init-timeout']; + const initTimeout = wasm.initTimeout = wasm.initTimeout ?? args['wasm-init-timeout']; if (typeof initTimeout !== 'undefined' && typeof initTimeout !== 'number') { - throw new Error('Flag "wasm-init-timeout" must be a number value'); - } - let simd = args['wasm-enable-simd']; - if (simd === 'true') { - simd = true; - } else if (simd === 'false') { - simd = false; - } else if (typeof simd !== 'undefined' && typeof simd !== 'boolean') { - throw new Error('Flag "wasm-enable-simd" must be a boolean value'); - } - let proxy = args['wasm-enable-proxy']; - if (proxy === 'true') { - proxy = true; - } else if (proxy === 'false') { - proxy = false; - } else if (typeof proxy !== 'undefined' && typeof proxy !== 'boolean') { - throw new Error('Flag "wasm-enable-proxy" must be a boolean value'); - } - return {numThreads, initTimeout, simd, proxy}; + throw new Error('Flag "wasm.initTimeout"/"wasm-init-timeout" must be a number value'); + } + const simd = wasm.simd = parseBooleanArg(wasm.simd ?? args['wasm-enable-simd']); + if (typeof simd !== 'undefined' && typeof simd !== 'boolean') { + throw new Error('Flag "wasm.simd"/"wasm-enable-simd" must be a boolean value'); + } + const proxy = wasm.proxy = parseBooleanArg(wasm.proxy ?? args['wasm-enable-proxy']); + if (typeof proxy !== 'undefined' && typeof proxy !== 'boolean') { + throw new Error('Flag "wasm.proxy"/"wasm-enable-proxy" must be a boolean value'); + } + return wasm; } function parseWebglOptions(_args: minimist.ParsedArgs): InferenceSession.WebGLExecutionProviderOption { @@ -300,47 +310,58 @@ function parseWebglOptions(_args: minimist.ParsedArgs): InferenceSession.WebGLEx } function parseWebglFlags(args: minimist.ParsedArgs): Partial { - const contextId = args['webgl-context-id']; + const webgl = args.webgl || {}; + const contextId = webgl.contextId = webgl.contextId ?? args['webgl-context-id']; if (contextId !== undefined && contextId !== 'webgl' && contextId !== 'webgl2') { - throw new Error('Flag "webgl-context-id" is invalid'); + throw new Error('Flag "webgl.contextId"/"webgl-context-id" is invalid'); } - const matmulMaxBatchSize = args['webgl-matmul-max-batch-size']; + const matmulMaxBatchSize = webgl.matmulMaxBatchSize = webgl.matmulMaxBatchSize ?? args['webgl-matmul-max-batch-size']; if (matmulMaxBatchSize !== undefined && typeof matmulMaxBatchSize !== 'number') { - throw new Error('Flag "webgl-matmul-max-batch-size" must be a number value'); + throw new Error('Flag "webgl.matmulMaxBatchSize"/"webgl-matmul-max-batch-size" must be a number value'); } - const textureCacheMode = args['webgl-texture-cache-mode']; + const textureCacheMode = webgl.textureCacheMode = webgl.textureCacheMode ?? args['webgl-texture-cache-mode']; if (textureCacheMode !== undefined && textureCacheMode !== 'initializerOnly' && textureCacheMode !== 'full') { - throw new Error('Flag "webgl-texture-cache-mode" is invalid'); + throw new Error('Flag "webgl.textureCacheMode"/"webgl-texture-cache-mode" is invalid'); } - const pack = args['webgl-texture-pack-mode']; + const pack = webgl.pack = parseBooleanArg(webgl.pack ?? 
args['webgl-texture-pack-mode']); if (pack !== undefined && typeof pack !== 'boolean') { - throw new Error('Flag "webgl-texture-pack-mode" is invalid'); + throw new Error('Flag "webgl.pack"/"webgl-texture-pack-mode" is invalid'); } - const async = args['webgl-async']; + const async = webgl.async = parseBooleanArg(webgl.async ?? args['webgl-async']); if (async !== undefined && typeof async !== 'boolean') { - throw new Error('Flag "webgl-async" is invalid'); + throw new Error('Flag "webgl.async"/"webgl-async" is invalid'); } - return {contextId, matmulMaxBatchSize, textureCacheMode, pack}; + return webgl; } function parseWebgpuFlags(args: minimist.ParsedArgs): Partial { - const profilingMode = args['webgpu-profiling-mode']; + const webgpu = args.webgpu || {}; + const profilingMode = (webgpu.profiling = webgpu.profiling ?? {}).mode = + webgpu?.profiling?.mode ?? webgpu.profilingMode ?? args['webgpu-profiling-mode']; if (profilingMode !== undefined && profilingMode !== 'off' && profilingMode !== 'default') { throw new Error('Flag "webgpu-profiling-mode" is invalid'); } - const validateInputContent = args['webgpu-validate-input-content']; + const validateInputContent = webgpu.validateInputContent = + parseBooleanArg(webgpu.validateInputContent ?? args['webgpu-validate-input-content']); if (validateInputContent !== undefined && typeof validateInputContent !== 'boolean') { throw new Error('Flag "webgpu-validate-input-content" is invalid'); } - return {profilingMode, validateInputContent}; + return webgpu; } -function parseGlobalEnvFlags(args: minimist.ParsedArgs): NonNullable { +function parseWebNNOptions(args: minimist.ParsedArgs): InferenceSession.WebNNExecutionProviderOption { + const deviceType = args['webnn-device-type']; + if (deviceType !== undefined && !['cpu', 'gpu', 'npu'].includes(deviceType)) { + throw new Error('Flag "webnn-device-type" is invalid'); + } + return {name: 'webnn', deviceType}; +} + +function parseGlobalEnvFlags(args: minimist.ParsedArgs) { const wasm = parseWasmFlags(args); const webgl = parseWebglFlags(args); const webgpu = parseWebgpuFlags(args); - const cpuFlags = parseCpuFlags(args); - return {webgl, wasm, webgpu, ...cpuFlags}; + return {webgl, wasm, webgpu}; } export function parseTestRunnerCliArgs(cmdlineArgs: string[]): TestRunnerCliArgs { @@ -368,13 +389,13 @@ export function parseTestRunnerCliArgs(cmdlineArgs: string[]): TestRunnerCliArgs } // Option: -b=<...>, --backend=<...> - const browserBackends = ['webgl', 'webgpu', 'wasm', 'xnnpack', 'webnn']; + const browserBackends = ['webgl', 'webgpu', 'wasm', 'webnn']; // TODO: remove this when Chrome support WebNN. // we need this for now because Chrome does not support webnn yet, // and ChromeCanary is not in CI. - const defaultBrowserBackends = ['webgl', 'webgpu', 'wasm', 'xnnpack' /*, 'webnn'*/]; + const defaultBrowserBackends = ['webgl', 'webgpu', 'wasm' /*, 'webnn'*/]; const nodejsBackends = ['cpu', 'wasm']; const backendArgs = args.backend || args.b; const backend = (typeof backendArgs !== 'string') ? (env === 'node' ? 
nodejsBackends : defaultBrowserBackends) : @@ -385,19 +406,14 @@ export function parseTestRunnerCliArgs(cmdlineArgs: string[]): TestRunnerCliArgs } } - const globalEnvFlags = parseGlobalEnvFlags(args); - - if (backend.includes('webnn') && !globalEnvFlags.wasm!.proxy) { - throw new Error('Backend webnn requires flag "wasm-enable-proxy" to be set to true.'); - } - // Options: // --log-verbose=<...> // --log-info=<...> // --log-warning=<...> // --log-error=<...> const logConfig = parseLogConfig(args); - globalEnvFlags.logLevel = logConfig[0]?.config.minimalSeverity; + let logLevel = logConfig[0]?.config.minimalSeverity; + // Option: -p, --profile const profile = (args.profile || args.p) ? true : false; if (profile) { @@ -405,9 +421,18 @@ export function parseTestRunnerCliArgs(cmdlineArgs: string[]): TestRunnerCliArgs logConfig.push({category: 'Profiler.node', config: {minimalSeverity: 'verbose'}}); logConfig.push({category: 'Profiler.op', config: {minimalSeverity: 'verbose'}}); logConfig.push({category: 'Profiler.backend', config: {minimalSeverity: 'verbose'}}); - globalEnvFlags.logLevel = 'verbose'; + logLevel = 'verbose'; } + // Option: -t, --trace + const trace = parseBooleanArg(args.trace || args.t, false); + + // Options: + // --wasm.<...>=<...> + // --webgl.<...>=<...> + // --webgpu.<...>=<...> + const globalEnvFlags = {...parseGlobalEnvFlags(args), debug, trace, logLevel}; + // Option: -P[=<...>], --perf[=<...>] const perfArg = (args.perf || args.P); const perf = perfArg ? true : false; @@ -449,10 +474,14 @@ export function parseTestRunnerCliArgs(cmdlineArgs: string[]): TestRunnerCliArgs const wasmOptions = parseWasmOptions(args); const webglOptions = parseWebglOptions(args); + const webnnOptions = parseWebNNOptions(args); // Option: --no-sandbox const noSandbox = !!args['no-sandbox']; + // Option: --user-data-dir + const userDataDir = args['user-data-dir']; + // parse chromium flags let chromiumFlags = args['chromium-flags']; if (!chromiumFlags) { @@ -487,9 +516,11 @@ export function parseTestRunnerCliArgs(cmdlineArgs: string[]): TestRunnerCliArgs fileCache, cpuOptions, webglOptions, + webnnOptions, wasmOptions, globalEnvFlags, noSandbox, + userDataDir, chromiumFlags }; } diff --git a/js/web/script/test-runner-cli.ts b/js/web/script/test-runner-cli.ts index 74a03290332a..03d637b35bc7 100644 --- a/js/web/script/test-runner-cli.ts +++ b/js/web/script/test-runner-cli.ts @@ -12,6 +12,7 @@ import * as os from 'os'; import * as path from 'path'; import {inspect} from 'util'; +import {onnx} from '../lib/onnxjs/ort-schema/protobuf/onnx'; import {bufferToBase64} from '../test/test-shared'; import {Test} from '../test/test-types'; @@ -165,6 +166,7 @@ async function main() { debug: args.debug, cpuOptions: args.cpuOptions, webglOptions: args.webglOptions, + webnnOptions: args.webnnOptions, wasmOptions: args.wasmOptions, globalEnvFlags: args.globalEnvFlags } @@ -263,10 +265,12 @@ async function main() { let modelUrl: string|null = null; let cases: Test.ModelTestCase[] = []; + let externalData: Array<{data: string; path: string}>|undefined; npmlog.verbose('TestRunnerCli.Init.Model', `Start to prepare test data from folder: ${testDataRootFolder}`); try { + const maybeExternalDataFiles: Array<[fileNameWithoutExtension: string, size: number]> = []; for (const thisPath of fs.readdirSync(testDataRootFolder)) { const thisFullPath = path.join(testDataRootFolder, thisPath); const stat = fs.lstatSync(thisFullPath); @@ -281,6 +285,8 @@ async function main() { } else { throw new Error('there are multiple 
model files under the folder specified'); } + } else { + maybeExternalDataFiles.push([path.parse(thisPath).name, stat.size]); } } else if (stat.isDirectory()) { const dataFiles: string[] = []; @@ -306,6 +312,34 @@ async function main() { if (modelUrl === null) { throw new Error('there are no model file under the folder specified'); } + // For performance reasons, we do not parse every model; we parse a model only when it is likely to have external + // data. We consider it "likely" when one of the following conditions is met: + // 1. any file in the same folder has a similar file name to the model file + // (e.g., the model file is "model_abc.onnx" and there is a file "model_abc.pb" or "model_abc.onnx.data") + // 2. the file size is larger than 1GB + const likelyToHaveExternalData = maybeExternalDataFiles.some( + ([fileNameWithoutExtension, size]) => + path.basename(modelUrl!).startsWith(fileNameWithoutExtension) || size >= 1 * 1024 * 1024 * 1024); + if (likelyToHaveExternalData) { + const model = onnx.ModelProto.decode(fs.readFileSync(path.join(testDataRootFolder, path.basename(modelUrl!)))); + const externalDataPathSet = new Set<string>(); + for (const initializer of model.graph!.initializer!) { + if (initializer.externalData) { + for (const data of initializer.externalData) { + if (data.key === 'location') { + externalDataPathSet.add(data.value!); + } + } + } + } + externalData = []; + const externalDataPaths = [...externalDataPathSet]; + for (const dataPath of externalDataPaths) { + const fullPath = path.resolve(testDataRootFolder, dataPath); + const url = path.join(TEST_DATA_BASE, path.relative(TEST_ROOT, fullPath)); + externalData.push({data: url, path: dataPath}); + } + } } catch (e) { npmlog.error('TestRunnerCli.Init.Model', `Failed to prepare test data. Error: ${inspect(e)}`); throw e; @@ -339,9 +373,23 @@ async function main() { npmlog.verbose('TestRunnerCli.Init.Model', ` Model file: ${modelUrl}`); npmlog.verbose('TestRunnerCli.Init.Model', ` Backend: ${backend}`); npmlog.verbose('TestRunnerCli.Init.Model', ` Test set(s): ${cases.length} (${caseCount})`); + if (externalData) { + npmlog.verbose('TestRunnerCli.Init.Model', ` External data: ${externalData.length}`); + for (const data of externalData) { + npmlog.verbose('TestRunnerCli.Init.Model', ` - ${data.path}`); + } + } npmlog.verbose('TestRunnerCli.Init.Model', '==============================================================='); - return {name: path.basename(testDataRootFolder), platformCondition, modelUrl, backend, cases, ioBinding}; + return { + name: path.basename(testDataRootFolder), + platformCondition, + modelUrl, + backend, + cases, + ioBinding, + externalData + }; } function tryLocateModelTestFolder(searchPattern: string): string { @@ -494,14 +542,13 @@ async function main() { npmlog.info('TestRunnerCli.Run', '(4/4) Running karma to start test runner...'); const webgpu = args.backends.indexOf('webgpu') > -1; const webnn = args.backends.indexOf('webnn') > -1; - const browser = getBrowserNameFromEnv( - args.env, - args.bundleMode === 'perf' ? 'perf' : - args.debug ?
'debug' : - 'test', - webgpu, webnn); + const browser = getBrowserNameFromEnv(args.env); const karmaArgs = ['karma', 'start', `--browsers ${browser}`]; const chromiumFlags = ['--enable-features=SharedArrayBuffer', ...args.chromiumFlags]; + if (args.bundleMode === 'dev' && !args.debug) { + // use headless for 'test' mode (when 'perf' and 'debug' are OFF) + chromiumFlags.push('--headless=new'); + } if (args.debug) { karmaArgs.push('--log-level info --timeout-mocha 9999999'); chromiumFlags.push('--remote-debugging-port=9333'); @@ -522,7 +569,13 @@ async function main() { if (webnn) { chromiumFlags.push('--enable-experimental-web-platform-features'); } + if (process.argv.includes('--karma-debug')) { + karmaArgs.push('--log-level debug'); + } karmaArgs.push(`--bundle-mode=${args.bundleMode}`); + if (args.userDataDir) { + karmaArgs.push(`--user-data-dir="${args.userDataDir}"`); + } karmaArgs.push(...chromiumFlags.map(flag => `--chromium-flags=${flag}`)); if (browser.startsWith('Edge')) { // There are currently 2 Edge browser launchers: @@ -614,15 +667,14 @@ async function main() { fs.writeJSONSync(path.join(TEST_ROOT, './testdata-config.json'), config); } - function getBrowserNameFromEnv( - env: TestRunnerCliArgs['env'], mode: 'debug'|'perf'|'test', webgpu: boolean, webnn: boolean) { + function getBrowserNameFromEnv(env: TestRunnerCliArgs['env']) { switch (env) { case 'chrome': - return selectChromeBrowser(mode, webgpu, webnn); + return 'ChromeTest'; case 'edge': return 'EdgeTest'; case 'firefox': - return 'Firefox'; + return 'FirefoxTest'; case 'electron': return 'Electron'; case 'safari': @@ -633,22 +685,6 @@ async function main() { throw new Error(`env "${env}" not supported.`); } } - - function selectChromeBrowser(mode: 'debug'|'perf'|'test', webgpu: boolean, webnn: boolean) { - if (webnn) { - return 'ChromeCanaryTest'; - } else if (webgpu) { - return 'ChromeTest'; - } else { - switch (mode) { - case 'debug': - case 'perf': - return 'ChromeTest'; - default: - return 'ChromeTestHeadless'; - } - } - } } void main(); diff --git a/js/web/test/data/ops/add_zero-sized.jsonc b/js/web/test/data/ops/add_zero-sized.jsonc new file mode 100644 index 000000000000..37e08cd7f20a --- /dev/null +++ b/js/web/test/data/ops/add_zero-sized.jsonc @@ -0,0 +1,31 @@ +[ + { + "name": "Add with no attributes", + "operator": "Add", + "attributes": [], + "cases": [ + { + "name": "T[2,0] T[2,1]", + "inputs": [ + { + "data": [], + "dims": [2, 0], + "type": "float32" + }, + { + "data": [1, 2], + "dims": [2, 1], + "type": "float32" + } + ], + "outputs": [ + { + "data": [], + "dims": [2, 0], + "type": "float32" + } + ] + } + ] + } +] diff --git a/js/web/test/data/ops/concat_zero-sized.jsonc b/js/web/test/data/ops/concat_zero-sized.jsonc new file mode 100644 index 000000000000..be9625145d15 --- /dev/null +++ b/js/web/test/data/ops/concat_zero-sized.jsonc @@ -0,0 +1,641 @@ +[ + { + "name": "Concat 2D axis=0", + "operator": "Concat", + "attributes": [{ "name": "axis", "data": -2, "type": "int" }], + "cases": [ + { + "name": "X", + "inputs": [ + { + "data": [], + "dims": [1, 4, 0, 64], + "type": "float32" + }, + { + "data": [ + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 
2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 
2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 + ], + "dims": [1, 4, 36, 64], + "type": "float32" + } + ], + "outputs": [ + { + "data": [ + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 
2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 
2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 + ], + "dims": [1, 4, 36, 64], + "type": "float32" + } + ] + } + ] + }, + { + "name": "Concat 2D axis=1; Preserve dims", + "operator": "Concat", + "attributes": [ + { + "name": "axis", + "data": 0, + "type": "int" + } + ], + "cases": [ + { + "name": "Some but not all input tensors are zero-sized", + "inputs": [ + { + "data": [], + "dims": [0, 1], + "type": "float32" + }, + { + "data": [1], + "dims": [1, 1], + "type": "float32" + } + ], + "outputs": [ + { + "data": [1], + "dims": [1, 1], + "type": "float32" + } + ] + } + ] + }, + { + "name": "Concat 2D axis=1; Preserve dims", + "operator": "Concat", + "attributes": [ + { + "name": "axis", + "data": 1, + "type": "int" + } + ], + "cases": [ + { + "name": "All input tensors are zero-sized", + "inputs": [ + { + "data": [], + "dims": [0, 0], + "type": "float32" + }, + { + "data": [], + 
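The Concat cases here pin down the zero-size behavior: inputs must agree on every dimension except the concat axis, and the output extent along that axis is the sum of the inputs' extents, so zero-sized inputs contribute nothing. A minimal sketch of that shape rule (illustrative only, not the ONNX Runtime Web implementation):

```typescript
// Concat shape rule: sum the extents along `axis`, keep the other dims.
// Zero-sized inputs add 0, so they effectively vanish from the output.
function concatOutputDims(inputDims: number[][], axis: number): number[] {
  const out = [...inputDims[0]];
  out[axis] = inputDims.reduce((sum, dims) => sum + dims[axis], 0);
  return out;
}

console.log(concatOutputDims([[0, 1], [1, 1]], 0)); // [1, 1]: the zero-sized input is dropped
console.log(concatOutputDims([[0, 0], [0, 1], [0, 2], [0, 3]], 1)); // [0, 6]
```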
"dims": [0, 1], + "type": "float32" + }, + { + "data": [], + "dims": [0, 2], + "type": "float32" + }, + { + "data": [], + "dims": [0, 3], + "type": "float32" + } + ], + "outputs": [ + { + "data": [], + "dims": [0, 6], + "type": "float32" + } + ] + } + ] + } +] diff --git a/js/web/test/data/ops/conv-transpose.jsonc b/js/web/test/data/ops/conv-transpose.jsonc index 7038e2a4f876..8ed48dd07e6f 100644 --- a/js/web/test/data/ops/conv-transpose.jsonc +++ b/js/web/test/data/ops/conv-transpose.jsonc @@ -392,5 +392,267 @@ ] } ] + }, + { + "name": "ConvTranspose without bias addition C", + "operator": "ConvTranspose", + "attributes": [ + { "name": "kernel_shape", "data": [2, 2], "type": "ints" }, + { "name": "strides", "data": [2, 2], "type": "ints" } + ], + "cases": [ + { + "name": "T[0]", + "inputs": [ + { + "data": [ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, + 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, + 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, + 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, + 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, + 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, + 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, + 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, + 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, + 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, + 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, + 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, + 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, + 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, + 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, + 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, + 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, + 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, + 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, + 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, + 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, + 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 15, 16, 17, 18, 19, 20, 21, 22, 23, 
24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, + 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, + 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, + 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, + 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, + 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, + 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, + 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 + ], + "dims": [1, 4, 16, 16], + "type": "float32" + }, + { + "data": [ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, + 14, 15 + ], + "dims": [4, 4, 2, 2], + "type": "float32" + } + ], + "outputs": [ + { + "data": [ + 0, 0, 0, 4, 0, 8, 0, 12, 0, 16, 0, 20, 0, 24, 0, 28, 0, 32, 0, 36, 0, 40, 0, 44, 0, 48, 0, 52, 0, 56, 0, + 60, 0, 0, 8, 12, 16, 24, 24, 36, 32, 48, 40, 60, 48, 72, 56, 84, 64, 96, 72, 108, 80, 120, 88, 132, 96, + 144, 104, 156, 112, 168, 120, 180, 0, 64, 0, 68, 0, 72, 0, 76, 0, 80, 0, 84, 0, 88, 0, 92, 0, 96, 0, 100, + 0, 104, 0, 108, 0, 112, 0, 116, 0, 120, 0, 124, 128, 192, 136, 204, 144, 216, 152, 228, 160, 240, 168, + 252, 176, 264, 184, 276, 192, 288, 200, 300, 208, 312, 216, 324, 224, 336, 232, 348, 240, 360, 248, 372, + 0, 0, 0, 4, 0, 8, 0, 12, 0, 16, 0, 20, 0, 24, 0, 28, 0, 32, 0, 36, 0, 40, 0, 44, 0, 48, 0, 52, 0, 56, 0, + 60, 0, 0, 8, 12, 16, 24, 24, 36, 32, 48, 40, 60, 48, 72, 56, 84, 64, 96, 72, 108, 80, 120, 88, 132, 96, + 144, 104, 156, 112, 168, 120, 180, 0, 64, 0, 68, 0, 72, 0, 76, 0, 80, 0, 84, 0, 88, 0, 92, 0, 96, 0, 100, + 0, 104, 0, 108, 0, 112, 0, 116, 0, 120, 0, 124, 128, 192, 136, 204, 144, 216, 152, 228, 160, 240, 168, + 252, 176, 264, 184, 276, 192, 288, 200, 300, 208, 312, 216, 324, 224, 336, 232, 348, 240, 360, 248, 372, + 0, 0, 0, 4, 0, 8, 0, 12, 0, 16, 0, 20, 0, 24, 0, 28, 0, 32, 0, 36, 0, 40, 0, 44, 0, 48, 0, 52, 0, 56, 0, + 60, 0, 0, 8, 12, 16, 24, 24, 36, 32, 48, 40, 60, 48, 72, 56, 84, 64, 96, 72, 108, 80, 120, 88, 132, 96, + 144, 104, 156, 112, 168, 120, 180, 0, 64, 0, 68, 0, 72, 0, 76, 0, 80, 0, 84, 0, 88, 0, 92, 0, 96, 0, 100, + 0, 104, 0, 108, 0, 112, 0, 116, 0, 120, 0, 124, 128, 192, 136, 204, 144, 216, 152, 228, 160, 240, 168, + 252, 176, 264, 184, 276, 192, 288, 200, 300, 208, 312, 216, 324, 224, 336, 232, 348, 240, 360, 248, 372, + 0, 0, 0, 4, 0, 8, 0, 12, 0, 16, 0, 20, 0, 24, 0, 28, 0, 32, 0, 36, 0, 40, 0, 44, 0, 48, 0, 52, 0, 56, 0, + 60, 0, 0, 8, 12, 16, 24, 24, 36, 32, 48, 40, 60, 48, 72, 56, 84, 64, 96, 72, 108, 80, 120, 88, 132, 96, + 144, 104, 156, 112, 168, 120, 180, 0, 64, 0, 68, 0, 72, 0, 76, 0, 80, 0, 84, 0, 88, 0, 92, 0, 96, 0, 100, + 0, 104, 0, 108, 0, 112, 0, 116, 0, 120, 0, 124, 128, 192, 136, 204, 144, 216, 152, 228, 160, 240, 168, + 252, 176, 264, 184, 276, 192, 
288, 200, 300, 208, 312, 216, 324, 224, 336, 232, 348, 240, 360, 248, 372, + 0, 0, 0, 4, 0, 8, 0, 12, 0, 16, 0, 20, 0, 24, 0, 28, 0, 32, 0, 36, 0, 40, 0, 44, 0, 48, 0, 52, 0, 56, 0, + 60, 0, 0, 8, 12, 16, 24, 24, 36, 32, 48, 40, 60, 48, 72, 56, 84, 64, 96, 72, 108, 80, 120, 88, 132, 96, + 144, 104, 156, 112, 168, 120, 180, 0, 64, 0, 68, 0, 72, 0, 76, 0, 80, 0, 84, 0, 88, 0, 92, 0, 96, 0, 100, + 0, 104, 0, 108, 0, 112, 0, 116, 0, 120, 0, 124, 128, 192, 136, 204, 144, 216, 152, 228, 160, 240, 168, + 252, 176, 264, 184, 276, 192, 288, 200, 300, 208, 312, 216, 324, 224, 336, 232, 348, 240, 360, 248, 372, + 0, 0, 0, 4, 0, 8, 0, 12, 0, 16, 0, 20, 0, 24, 0, 28, 0, 32, 0, 36, 0, 40, 0, 44, 0, 48, 0, 52, 0, 56, 0, + 60, 0, 0, 8, 12, 16, 24, 24, 36, 32, 48, 40, 60, 48, 72, 56, 84, 64, 96, 72, 108, 80, 120, 88, 132, 96, + 144, 104, 156, 112, 168, 120, 180, 0, 64, 0, 68, 0, 72, 0, 76, 0, 80, 0, 84, 0, 88, 0, 92, 0, 96, 0, 100, + 0, 104, 0, 108, 0, 112, 0, 116, 0, 120, 0, 124, 128, 192, 136, 204, 144, 216, 152, 228, 160, 240, 168, + 252, 176, 264, 184, 276, 192, 288, 200, 300, 208, 312, 216, 324, 224, 336, 232, 348, 240, 360, 248, 372, + 0, 0, 0, 4, 0, 8, 0, 12, 0, 16, 0, 20, 0, 24, 0, 28, 0, 32, 0, 36, 0, 40, 0, 44, 0, 48, 0, 52, 0, 56, 0, + 60, 0, 0, 8, 12, 16, 24, 24, 36, 32, 48, 40, 60, 48, 72, 56, 84, 64, 96, 72, 108, 80, 120, 88, 132, 96, + 144, 104, 156, 112, 168, 120, 180, 0, 64, 0, 68, 0, 72, 0, 76, 0, 80, 0, 84, 0, 88, 0, 92, 0, 96, 0, 100, + 0, 104, 0, 108, 0, 112, 0, 116, 0, 120, 0, 124, 128, 192, 136, 204, 144, 216, 152, 228, 160, 240, 168, + 252, 176, 264, 184, 276, 192, 288, 200, 300, 208, 312, 216, 324, 224, 336, 232, 348, 240, 360, 248, 372, + 0, 0, 0, 4, 0, 8, 0, 12, 0, 16, 0, 20, 0, 24, 0, 28, 0, 32, 0, 36, 0, 40, 0, 44, 0, 48, 0, 52, 0, 56, 0, + 60, 0, 0, 8, 12, 16, 24, 24, 36, 32, 48, 40, 60, 48, 72, 56, 84, 64, 96, 72, 108, 80, 120, 88, 132, 96, + 144, 104, 156, 112, 168, 120, 180, 0, 64, 0, 68, 0, 72, 0, 76, 0, 80, 0, 84, 0, 88, 0, 92, 0, 96, 0, 100, + 0, 104, 0, 108, 0, 112, 0, 116, 0, 120, 0, 124, 128, 192, 136, 204, 144, 216, 152, 228, 160, 240, 168, + 252, 176, 264, 184, 276, 192, 288, 200, 300, 208, 312, 216, 324, 224, 336, 232, 348, 240, 360, 248, 372, + 0, 0, 16, 20, 32, 40, 48, 60, 64, 80, 80, 100, 96, 120, 112, 140, 128, 160, 144, 180, 160, 200, 176, 220, + 192, 240, 208, 260, 224, 280, 240, 300, 0, 0, 24, 28, 48, 56, 72, 84, 96, 112, 120, 140, 144, 168, 168, + 196, 192, 224, 216, 252, 240, 280, 264, 308, 288, 336, 312, 364, 336, 392, 360, 420, 256, 320, 272, 340, + 288, 360, 304, 380, 320, 400, 336, 420, 352, 440, 368, 460, 384, 480, 400, 500, 416, 520, 432, 540, 448, + 560, 464, 580, 480, 600, 496, 620, 384, 448, 408, 476, 432, 504, 456, 532, 480, 560, 504, 588, 528, 616, + 552, 644, 576, 672, 600, 700, 624, 728, 648, 756, 672, 784, 696, 812, 720, 840, 744, 868, 0, 0, 16, 20, + 32, 40, 48, 60, 64, 80, 80, 100, 96, 120, 112, 140, 128, 160, 144, 180, 160, 200, 176, 220, 192, 240, 208, + 260, 224, 280, 240, 300, 0, 0, 24, 28, 48, 56, 72, 84, 96, 112, 120, 140, 144, 168, 168, 196, 192, 224, + 216, 252, 240, 280, 264, 308, 288, 336, 312, 364, 336, 392, 360, 420, 256, 320, 272, 340, 288, 360, 304, + 380, 320, 400, 336, 420, 352, 440, 368, 460, 384, 480, 400, 500, 416, 520, 432, 540, 448, 560, 464, 580, + 480, 600, 496, 620, 384, 448, 408, 476, 432, 504, 456, 532, 480, 560, 504, 588, 528, 616, 552, 644, 576, + 672, 600, 700, 624, 728, 648, 756, 672, 784, 696, 812, 720, 840, 744, 868, 0, 0, 16, 20, 32, 40, 48, 60, + 64, 80, 80, 100, 96, 120, 112, 140, 128, 160, 144, 180, 160, 
200, 176, 220, 192, 240, 208, 260, 224, 280, + 240, 300, 0, 0, 24, 28, 48, 56, 72, 84, 96, 112, 120, 140, 144, 168, 168, 196, 192, 224, 216, 252, 240, + 280, 264, 308, 288, 336, 312, 364, 336, 392, 360, 420, 256, 320, 272, 340, 288, 360, 304, 380, 320, 400, + 336, 420, 352, 440, 368, 460, 384, 480, 400, 500, 416, 520, 432, 540, 448, 560, 464, 580, 480, 600, 496, + 620, 384, 448, 408, 476, 432, 504, 456, 532, 480, 560, 504, 588, 528, 616, 552, 644, 576, 672, 600, 700, + 624, 728, 648, 756, 672, 784, 696, 812, 720, 840, 744, 868, 0, 0, 16, 20, 32, 40, 48, 60, 64, 80, 80, 100, + 96, 120, 112, 140, 128, 160, 144, 180, 160, 200, 176, 220, 192, 240, 208, 260, 224, 280, 240, 300, 0, 0, + 24, 28, 48, 56, 72, 84, 96, 112, 120, 140, 144, 168, 168, 196, 192, 224, 216, 252, 240, 280, 264, 308, + 288, 336, 312, 364, 336, 392, 360, 420, 256, 320, 272, 340, 288, 360, 304, 380, 320, 400, 336, 420, 352, + 440, 368, 460, 384, 480, 400, 500, 416, 520, 432, 540, 448, 560, 464, 580, 480, 600, 496, 620, 384, 448, + 408, 476, 432, 504, 456, 532, 480, 560, 504, 588, 528, 616, 552, 644, 576, 672, 600, 700, 624, 728, 648, + 756, 672, 784, 696, 812, 720, 840, 744, 868, 0, 0, 16, 20, 32, 40, 48, 60, 64, 80, 80, 100, 96, 120, 112, + 140, 128, 160, 144, 180, 160, 200, 176, 220, 192, 240, 208, 260, 224, 280, 240, 300, 0, 0, 24, 28, 48, 56, + 72, 84, 96, 112, 120, 140, 144, 168, 168, 196, 192, 224, 216, 252, 240, 280, 264, 308, 288, 336, 312, 364, + 336, 392, 360, 420, 256, 320, 272, 340, 288, 360, 304, 380, 320, 400, 336, 420, 352, 440, 368, 460, 384, + 480, 400, 500, 416, 520, 432, 540, 448, 560, 464, 580, 480, 600, 496, 620, 384, 448, 408, 476, 432, 504, + 456, 532, 480, 560, 504, 588, 528, 616, 552, 644, 576, 672, 600, 700, 624, 728, 648, 756, 672, 784, 696, + 812, 720, 840, 744, 868, 0, 0, 16, 20, 32, 40, 48, 60, 64, 80, 80, 100, 96, 120, 112, 140, 128, 160, 144, + 180, 160, 200, 176, 220, 192, 240, 208, 260, 224, 280, 240, 300, 0, 0, 24, 28, 48, 56, 72, 84, 96, 112, + 120, 140, 144, 168, 168, 196, 192, 224, 216, 252, 240, 280, 264, 308, 288, 336, 312, 364, 336, 392, 360, + 420, 256, 320, 272, 340, 288, 360, 304, 380, 320, 400, 336, 420, 352, 440, 368, 460, 384, 480, 400, 500, + 416, 520, 432, 540, 448, 560, 464, 580, 480, 600, 496, 620, 384, 448, 408, 476, 432, 504, 456, 532, 480, + 560, 504, 588, 528, 616, 552, 644, 576, 672, 600, 700, 624, 728, 648, 756, 672, 784, 696, 812, 720, 840, + 744, 868, 0, 0, 16, 20, 32, 40, 48, 60, 64, 80, 80, 100, 96, 120, 112, 140, 128, 160, 144, 180, 160, 200, + 176, 220, 192, 240, 208, 260, 224, 280, 240, 300, 0, 0, 24, 28, 48, 56, 72, 84, 96, 112, 120, 140, 144, + 168, 168, 196, 192, 224, 216, 252, 240, 280, 264, 308, 288, 336, 312, 364, 336, 392, 360, 420, 256, 320, + 272, 340, 288, 360, 304, 380, 320, 400, 336, 420, 352, 440, 368, 460, 384, 480, 400, 500, 416, 520, 432, + 540, 448, 560, 464, 580, 480, 600, 496, 620, 384, 448, 408, 476, 432, 504, 456, 532, 480, 560, 504, 588, + 528, 616, 552, 644, 576, 672, 600, 700, 624, 728, 648, 756, 672, 784, 696, 812, 720, 840, 744, 868, 0, 0, + 16, 20, 32, 40, 48, 60, 64, 80, 80, 100, 96, 120, 112, 140, 128, 160, 144, 180, 160, 200, 176, 220, 192, + 240, 208, 260, 224, 280, 240, 300, 0, 0, 24, 28, 48, 56, 72, 84, 96, 112, 120, 140, 144, 168, 168, 196, + 192, 224, 216, 252, 240, 280, 264, 308, 288, 336, 312, 364, 336, 392, 360, 420, 256, 320, 272, 340, 288, + 360, 304, 380, 320, 400, 336, 420, 352, 440, 368, 460, 384, 480, 400, 500, 416, 520, 432, 540, 448, 560, + 464, 580, 480, 600, 496, 620, 384, 448, 408, 476, 432, 504, 456, 532, 480, 560, 
504, 588, 528, 616, 552, + 644, 576, 672, 600, 700, 624, 728, 648, 756, 672, 784, 696, 812, 720, 840, 744, 868, 0, 0, 32, 36, 64, 72, + 96, 108, 128, 144, 160, 180, 192, 216, 224, 252, 256, 288, 288, 324, 320, 360, 352, 396, 384, 432, 416, + 468, 448, 504, 480, 540, 0, 0, 40, 44, 80, 88, 120, 132, 160, 176, 200, 220, 240, 264, 280, 308, 320, 352, + 360, 396, 400, 440, 440, 484, 480, 528, 520, 572, 560, 616, 600, 660, 512, 576, 544, 612, 576, 648, 608, + 684, 640, 720, 672, 756, 704, 792, 736, 828, 768, 864, 800, 900, 832, 936, 864, 972, 896, 1008, 928, 1044, + 960, 1080, 992, 1116, 640, 704, 680, 748, 720, 792, 760, 836, 800, 880, 840, 924, 880, 968, 920, 1012, + 960, 1056, 1000, 1100, 1040, 1144, 1080, 1188, 1120, 1232, 1160, 1276, 1200, 1320, 1240, 1364, 0, 0, 32, + 36, 64, 72, 96, 108, 128, 144, 160, 180, 192, 216, 224, 252, 256, 288, 288, 324, 320, 360, 352, 396, 384, + 432, 416, 468, 448, 504, 480, 540, 0, 0, 40, 44, 80, 88, 120, 132, 160, 176, 200, 220, 240, 264, 280, 308, + 320, 352, 360, 396, 400, 440, 440, 484, 480, 528, 520, 572, 560, 616, 600, 660, 512, 576, 544, 612, 576, + 648, 608, 684, 640, 720, 672, 756, 704, 792, 736, 828, 768, 864, 800, 900, 832, 936, 864, 972, 896, 1008, + 928, 1044, 960, 1080, 992, 1116, 640, 704, 680, 748, 720, 792, 760, 836, 800, 880, 840, 924, 880, 968, + 920, 1012, 960, 1056, 1000, 1100, 1040, 1144, 1080, 1188, 1120, 1232, 1160, 1276, 1200, 1320, 1240, 1364, + 0, 0, 32, 36, 64, 72, 96, 108, 128, 144, 160, 180, 192, 216, 224, 252, 256, 288, 288, 324, 320, 360, 352, + 396, 384, 432, 416, 468, 448, 504, 480, 540, 0, 0, 40, 44, 80, 88, 120, 132, 160, 176, 200, 220, 240, 264, + 280, 308, 320, 352, 360, 396, 400, 440, 440, 484, 480, 528, 520, 572, 560, 616, 600, 660, 512, 576, 544, + 612, 576, 648, 608, 684, 640, 720, 672, 756, 704, 792, 736, 828, 768, 864, 800, 900, 832, 936, 864, 972, + 896, 1008, 928, 1044, 960, 1080, 992, 1116, 640, 704, 680, 748, 720, 792, 760, 836, 800, 880, 840, 924, + 880, 968, 920, 1012, 960, 1056, 1000, 1100, 1040, 1144, 1080, 1188, 1120, 1232, 1160, 1276, 1200, 1320, + 1240, 1364, 0, 0, 32, 36, 64, 72, 96, 108, 128, 144, 160, 180, 192, 216, 224, 252, 256, 288, 288, 324, + 320, 360, 352, 396, 384, 432, 416, 468, 448, 504, 480, 540, 0, 0, 40, 44, 80, 88, 120, 132, 160, 176, 200, + 220, 240, 264, 280, 308, 320, 352, 360, 396, 400, 440, 440, 484, 480, 528, 520, 572, 560, 616, 600, 660, + 512, 576, 544, 612, 576, 648, 608, 684, 640, 720, 672, 756, 704, 792, 736, 828, 768, 864, 800, 900, 832, + 936, 864, 972, 896, 1008, 928, 1044, 960, 1080, 992, 1116, 640, 704, 680, 748, 720, 792, 760, 836, 800, + 880, 840, 924, 880, 968, 920, 1012, 960, 1056, 1000, 1100, 1040, 1144, 1080, 1188, 1120, 1232, 1160, 1276, + 1200, 1320, 1240, 1364, 0, 0, 32, 36, 64, 72, 96, 108, 128, 144, 160, 180, 192, 216, 224, 252, 256, 288, + 288, 324, 320, 360, 352, 396, 384, 432, 416, 468, 448, 504, 480, 540, 0, 0, 40, 44, 80, 88, 120, 132, 160, + 176, 200, 220, 240, 264, 280, 308, 320, 352, 360, 396, 400, 440, 440, 484, 480, 528, 520, 572, 560, 616, + 600, 660, 512, 576, 544, 612, 576, 648, 608, 684, 640, 720, 672, 756, 704, 792, 736, 828, 768, 864, 800, + 900, 832, 936, 864, 972, 896, 1008, 928, 1044, 960, 1080, 992, 1116, 640, 704, 680, 748, 720, 792, 760, + 836, 800, 880, 840, 924, 880, 968, 920, 1012, 960, 1056, 1000, 1100, 1040, 1144, 1080, 1188, 1120, 1232, + 1160, 1276, 1200, 1320, 1240, 1364, 0, 0, 32, 36, 64, 72, 96, 108, 128, 144, 160, 180, 192, 216, 224, 252, + 256, 288, 288, 324, 320, 360, 352, 396, 384, 432, 416, 468, 448, 504, 480, 540, 0, 0, 40, 
44, 80, 88, 120, + 132, 160, 176, 200, 220, 240, 264, 280, 308, 320, 352, 360, 396, 400, 440, 440, 484, 480, 528, 520, 572, + 560, 616, 600, 660, 512, 576, 544, 612, 576, 648, 608, 684, 640, 720, 672, 756, 704, 792, 736, 828, 768, + 864, 800, 900, 832, 936, 864, 972, 896, 1008, 928, 1044, 960, 1080, 992, 1116, 640, 704, 680, 748, 720, + 792, 760, 836, 800, 880, 840, 924, 880, 968, 920, 1012, 960, 1056, 1000, 1100, 1040, 1144, 1080, 1188, + 1120, 1232, 1160, 1276, 1200, 1320, 1240, 1364, 0, 0, 32, 36, 64, 72, 96, 108, 128, 144, 160, 180, 192, + 216, 224, 252, 256, 288, 288, 324, 320, 360, 352, 396, 384, 432, 416, 468, 448, 504, 480, 540, 0, 0, 40, + 44, 80, 88, 120, 132, 160, 176, 200, 220, 240, 264, 280, 308, 320, 352, 360, 396, 400, 440, 440, 484, 480, + 528, 520, 572, 560, 616, 600, 660, 512, 576, 544, 612, 576, 648, 608, 684, 640, 720, 672, 756, 704, 792, + 736, 828, 768, 864, 800, 900, 832, 936, 864, 972, 896, 1008, 928, 1044, 960, 1080, 992, 1116, 640, 704, + 680, 748, 720, 792, 760, 836, 800, 880, 840, 924, 880, 968, 920, 1012, 960, 1056, 1000, 1100, 1040, 1144, + 1080, 1188, 1120, 1232, 1160, 1276, 1200, 1320, 1240, 1364, 0, 0, 32, 36, 64, 72, 96, 108, 128, 144, 160, + 180, 192, 216, 224, 252, 256, 288, 288, 324, 320, 360, 352, 396, 384, 432, 416, 468, 448, 504, 480, 540, + 0, 0, 40, 44, 80, 88, 120, 132, 160, 176, 200, 220, 240, 264, 280, 308, 320, 352, 360, 396, 400, 440, 440, + 484, 480, 528, 520, 572, 560, 616, 600, 660, 512, 576, 544, 612, 576, 648, 608, 684, 640, 720, 672, 756, + 704, 792, 736, 828, 768, 864, 800, 900, 832, 936, 864, 972, 896, 1008, 928, 1044, 960, 1080, 992, 1116, + 640, 704, 680, 748, 720, 792, 760, 836, 800, 880, 840, 924, 880, 968, 920, 1012, 960, 1056, 1000, 1100, + 1040, 1144, 1080, 1188, 1120, 1232, 1160, 1276, 1200, 1320, 1240, 1364, 0, 0, 48, 52, 96, 104, 144, 156, + 192, 208, 240, 260, 288, 312, 336, 364, 384, 416, 432, 468, 480, 520, 528, 572, 576, 624, 624, 676, 672, + 728, 720, 780, 0, 0, 56, 60, 112, 120, 168, 180, 224, 240, 280, 300, 336, 360, 392, 420, 448, 480, 504, + 540, 560, 600, 616, 660, 672, 720, 728, 780, 784, 840, 840, 900, 768, 832, 816, 884, 864, 936, 912, 988, + 960, 1040, 1008, 1092, 1056, 1144, 1104, 1196, 1152, 1248, 1200, 1300, 1248, 1352, 1296, 1404, 1344, 1456, + 1392, 1508, 1440, 1560, 1488, 1612, 896, 960, 952, 1020, 1008, 1080, 1064, 1140, 1120, 1200, 1176, 1260, + 1232, 1320, 1288, 1380, 1344, 1440, 1400, 1500, 1456, 1560, 1512, 1620, 1568, 1680, 1624, 1740, 1680, + 1800, 1736, 1860, 0, 0, 48, 52, 96, 104, 144, 156, 192, 208, 240, 260, 288, 312, 336, 364, 384, 416, 432, + 468, 480, 520, 528, 572, 576, 624, 624, 676, 672, 728, 720, 780, 0, 0, 56, 60, 112, 120, 168, 180, 224, + 240, 280, 300, 336, 360, 392, 420, 448, 480, 504, 540, 560, 600, 616, 660, 672, 720, 728, 780, 784, 840, + 840, 900, 768, 832, 816, 884, 864, 936, 912, 988, 960, 1040, 1008, 1092, 1056, 1144, 1104, 1196, 1152, + 1248, 1200, 1300, 1248, 1352, 1296, 1404, 1344, 1456, 1392, 1508, 1440, 1560, 1488, 1612, 896, 960, 952, + 1020, 1008, 1080, 1064, 1140, 1120, 1200, 1176, 1260, 1232, 1320, 1288, 1380, 1344, 1440, 1400, 1500, + 1456, 1560, 1512, 1620, 1568, 1680, 1624, 1740, 1680, 1800, 1736, 1860, 0, 0, 48, 52, 96, 104, 144, 156, + 192, 208, 240, 260, 288, 312, 336, 364, 384, 416, 432, 468, 480, 520, 528, 572, 576, 624, 624, 676, 672, + 728, 720, 780, 0, 0, 56, 60, 112, 120, 168, 180, 224, 240, 280, 300, 336, 360, 392, 420, 448, 480, 504, + 540, 560, 600, 616, 660, 672, 720, 728, 780, 784, 840, 840, 900, 768, 832, 816, 884, 864, 936, 912, 988, + 960, 1040, 
1008, 1092, 1056, 1144, 1104, 1196, 1152, 1248, 1200, 1300, 1248, 1352, 1296, 1404, 1344, 1456, + 1392, 1508, 1440, 1560, 1488, 1612, 896, 960, 952, 1020, 1008, 1080, 1064, 1140, 1120, 1200, 1176, 1260, + 1232, 1320, 1288, 1380, 1344, 1440, 1400, 1500, 1456, 1560, 1512, 1620, 1568, 1680, 1624, 1740, 1680, + 1800, 1736, 1860, 0, 0, 48, 52, 96, 104, 144, 156, 192, 208, 240, 260, 288, 312, 336, 364, 384, 416, 432, + 468, 480, 520, 528, 572, 576, 624, 624, 676, 672, 728, 720, 780, 0, 0, 56, 60, 112, 120, 168, 180, 224, + 240, 280, 300, 336, 360, 392, 420, 448, 480, 504, 540, 560, 600, 616, 660, 672, 720, 728, 780, 784, 840, + 840, 900, 768, 832, 816, 884, 864, 936, 912, 988, 960, 1040, 1008, 1092, 1056, 1144, 1104, 1196, 1152, + 1248, 1200, 1300, 1248, 1352, 1296, 1404, 1344, 1456, 1392, 1508, 1440, 1560, 1488, 1612, 896, 960, 952, + 1020, 1008, 1080, 1064, 1140, 1120, 1200, 1176, 1260, 1232, 1320, 1288, 1380, 1344, 1440, 1400, 1500, + 1456, 1560, 1512, 1620, 1568, 1680, 1624, 1740, 1680, 1800, 1736, 1860, 0, 0, 48, 52, 96, 104, 144, 156, + 192, 208, 240, 260, 288, 312, 336, 364, 384, 416, 432, 468, 480, 520, 528, 572, 576, 624, 624, 676, 672, + 728, 720, 780, 0, 0, 56, 60, 112, 120, 168, 180, 224, 240, 280, 300, 336, 360, 392, 420, 448, 480, 504, + 540, 560, 600, 616, 660, 672, 720, 728, 780, 784, 840, 840, 900, 768, 832, 816, 884, 864, 936, 912, 988, + 960, 1040, 1008, 1092, 1056, 1144, 1104, 1196, 1152, 1248, 1200, 1300, 1248, 1352, 1296, 1404, 1344, 1456, + 1392, 1508, 1440, 1560, 1488, 1612, 896, 960, 952, 1020, 1008, 1080, 1064, 1140, 1120, 1200, 1176, 1260, + 1232, 1320, 1288, 1380, 1344, 1440, 1400, 1500, 1456, 1560, 1512, 1620, 1568, 1680, 1624, 1740, 1680, + 1800, 1736, 1860, 0, 0, 48, 52, 96, 104, 144, 156, 192, 208, 240, 260, 288, 312, 336, 364, 384, 416, 432, + 468, 480, 520, 528, 572, 576, 624, 624, 676, 672, 728, 720, 780, 0, 0, 56, 60, 112, 120, 168, 180, 224, + 240, 280, 300, 336, 360, 392, 420, 448, 480, 504, 540, 560, 600, 616, 660, 672, 720, 728, 780, 784, 840, + 840, 900, 768, 832, 816, 884, 864, 936, 912, 988, 960, 1040, 1008, 1092, 1056, 1144, 1104, 1196, 1152, + 1248, 1200, 1300, 1248, 1352, 1296, 1404, 1344, 1456, 1392, 1508, 1440, 1560, 1488, 1612, 896, 960, 952, + 1020, 1008, 1080, 1064, 1140, 1120, 1200, 1176, 1260, 1232, 1320, 1288, 1380, 1344, 1440, 1400, 1500, + 1456, 1560, 1512, 1620, 1568, 1680, 1624, 1740, 1680, 1800, 1736, 1860, 0, 0, 48, 52, 96, 104, 144, 156, + 192, 208, 240, 260, 288, 312, 336, 364, 384, 416, 432, 468, 480, 520, 528, 572, 576, 624, 624, 676, 672, + 728, 720, 780, 0, 0, 56, 60, 112, 120, 168, 180, 224, 240, 280, 300, 336, 360, 392, 420, 448, 480, 504, + 540, 560, 600, 616, 660, 672, 720, 728, 780, 784, 840, 840, 900, 768, 832, 816, 884, 864, 936, 912, 988, + 960, 1040, 1008, 1092, 1056, 1144, 1104, 1196, 1152, 1248, 1200, 1300, 1248, 1352, 1296, 1404, 1344, 1456, + 1392, 1508, 1440, 1560, 1488, 1612, 896, 960, 952, 1020, 1008, 1080, 1064, 1140, 1120, 1200, 1176, 1260, + 1232, 1320, 1288, 1380, 1344, 1440, 1400, 1500, 1456, 1560, 1512, 1620, 1568, 1680, 1624, 1740, 1680, + 1800, 1736, 1860, 0, 0, 48, 52, 96, 104, 144, 156, 192, 208, 240, 260, 288, 312, 336, 364, 384, 416, 432, + 468, 480, 520, 528, 572, 576, 624, 624, 676, 672, 728, 720, 780, 0, 0, 56, 60, 112, 120, 168, 180, 224, + 240, 280, 300, 336, 360, 392, 420, 448, 480, 504, 540, 560, 600, 616, 660, 672, 720, 728, 780, 784, 840, + 840, 900, 768, 832, 816, 884, 864, 936, 912, 988, 960, 1040, 1008, 1092, 1056, 1144, 1104, 1196, 1152, + 1248, 1200, 1300, 1248, 1352, 1296, 1404, 1344, 
1456, 1392, 1508, 1440, 1560, 1488, 1612, 896, 960, 952, + 1020, 1008, 1080, 1064, 1140, 1120, 1200, 1176, 1260, 1232, 1320, 1288, 1380, 1344, 1440, 1400, 1500, + 1456, 1560, 1512, 1620, 1568, 1680, 1624, 1740, 1680, 1800, 1736, 1860 + ], + "dims": [1, 4, 32, 32], + "type": "float32" + } + ] + } + ] } ] diff --git a/js/web/test/data/ops/conv.jsonc b/js/web/test/data/ops/conv.jsonc index 2e8eaaba191d..cc10df586423 100644 --- a/js/web/test/data/ops/conv.jsonc +++ b/js/web/test/data/ops/conv.jsonc @@ -298,7 +298,157 @@ } ] }, - + { + "name": "conv - vectorize group - A", + "operator": "Conv", + "inputShapeDefinitions": "rankOnly", + "opset": { "domain": "", "version": 17 }, + "attributes": [ + { "name": "kernel_shape", "data": [1, 1], "type": "ints" }, + { "name": "group", "data": 2, "type": "int" } + ], + "cases": [ + { + "name": "T[0]", + "inputs": [ + { + "data": [0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0], + "dims": [1, 2, 3, 3], + "type": "float32" + }, + { + "data": [1.0, 2.0], + "dims": [2, 1, 1, 1], + "type": "float32" + } + ], + "outputs": [ + { + "data": [0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 18.0, 20.0, 22.0, 24.0, 26.0, 28.0, 30.0, 32.0, 34.0], + "dims": [1, 2, 3, 3], + "type": "float32" + } + ] + } + ] + }, + { + "name": "conv - vectorize group - B", + "operator": "Conv", + "inputShapeDefinitions": "rankOnly", + "opset": { "domain": "", "version": 17 }, + "attributes": [ + { "name": "kernel_shape", "data": [2, 2], "type": "ints" }, + { "name": "group", "data": 3, "type": "int" } + ], + "cases": [ + { + "name": "T[0]", + "inputs": [ + { + "data": [ + 0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, + 19.0, 20.0, 21.0, 22.0, 23.0, 0, 0, 0 + ], + "dims": [1, 3, 3, 3], + "type": "float32" + }, + { + "data": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0], + "dims": [3, 1, 2, 2], + "type": "float32" + }, + { + "data": [0.1, 0.2, 0.3], + "dims": [3], + "type": "float32" + } + ], + "outputs": [ + { + "data": [27.1, 37.1, 57.1, 67.1, 293.2, 319.2, 371.2, 397.2, 847.3, 889.3, 409.3, 428.3], + "dims": [1, 3, 2, 2], + "type": "float32" + } + ] + } + ] + }, + { + "name": "conv - vectorize group - C", + "operator": "Conv", + "inputShapeDefinitions": "rankOnly", + "opset": { "domain": "", "version": 17 }, + "attributes": [ + { "name": "kernel_shape", "data": [2, 2], "type": "ints" }, + { "name": "group", "data": 3, "type": "int" } + ], + "cases": [ + { + "name": "T[0]", + "inputs": [ + { + "data": [ + 0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, + 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0 + ], + "dims": [1, 3, 3, 4], + "type": "float32" + }, + { + "data": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0], + "dims": [3, 1, 2, 2], + "type": "float32" + } + ], + "outputs": [ + { + "data": [34, 44, 54, 74, 84, 94, 386, 412, 438, 490, 516, 542, 1122, 1164, 1206, 1290, 1332, 1374], + "dims": [1, 3, 2, 3], + "type": "float32" + } + ] + } + ] + }, + { + "name": "conv - vectorize group - D", + "operator": "Conv", + "inputShapeDefinitions": "rankOnly", + "opset": { "domain": "", "version": 17 }, + "attributes": [ + { "name": "kernel_shape", "data": [2, 2], "type": "ints" }, + { "name": "group", "data": 3, "type": "int" }, + { "name": "strides", "data": [2, 2], "type": "ints" } + ], + "cases": [ + { + "name": "T[0] strides = [2, 2]", + 
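The ConvTranspose case above reduces to simple shape arithmetic: with kernel 2, stride 2, and no padding, each 16x16 map becomes 32x32, since (16 - 1) * 2 + 2 = 32. The grouped "vectorize group" Conv cases do the forward counterpart, with group = 3 giving each output channel its own single input channel. A sketch of both shape rules, assuming the standard ONNX conventions with no padding or dilation (this is not code from this PR):

```typescript
// Output spatial size without padding or dilation, per the usual ONNX rules.
function convOutSize(inSize: number, kernel: number, stride = 1): number {
  return Math.floor((inSize - kernel) / stride) + 1;
}
function convTransposeOutSize(inSize: number, kernel: number, stride = 1): number {
  return (inSize - 1) * stride + kernel;
}

console.log(convTransposeOutSize(16, 2, 2)); // 32 -> output dims [1, 4, 32, 32] above
console.log([convOutSize(3, 2), convOutSize(4, 2)]); // [2, 3] -> group=3 case C: [1, 3, 2, 3]
console.log([convOutSize(3, 2, 2), convOutSize(4, 2, 2)]); // [1, 2] -> case D: [1, 3, 1, 2]
```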
"inputs": [ + { + "data": [ + 0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, + 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0 + ], + "dims": [1, 3, 3, 4], + "type": "float32" + }, + { + "data": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0], + "dims": [3, 1, 2, 2], + "type": "float32" + } + ], + "outputs": [ + { + "data": [34, 54, 386, 438, 1122, 1206], + "dims": [1, 3, 1, 2], + "type": "float32" + } + ] + } + ] + }, { "name": "conv - pointwise", "operator": "Conv", diff --git a/js/web/test/data/ops/expand.jsonc b/js/web/test/data/ops/expand.jsonc index 22bc04d558d9..613b4507b2b1 100644 --- a/js/web/test/data/ops/expand.jsonc +++ b/js/web/test/data/ops/expand.jsonc @@ -168,20 +168,39 @@ "name": "Expand - last dim is not divisible by 4", "inputs": [ { - "data": [true, false, false, true, true, true, false, false, false, true, true, true], - "dims": [2, 6], + "data": [true, false, false, true, true, true], + "dims": [1, 6], "type": "bool" }, { - "data": [2, 1], + "data": [3, 1], "dims": [2], "type": "int64" } ], "outputs": [ { - "data": [true, false, false, true, true, true, false, false, false, true, true, true], - "dims": [2, 6], + "data": [ + true, + false, + false, + true, + true, + true, + true, + false, + false, + true, + true, + true, + true, + false, + false, + true, + true, + true + ], + "dims": [3, 6], "type": "bool" } ] diff --git a/js/web/test/data/ops/fast-gelu.jsonc b/js/web/test/data/ops/fast-gelu.jsonc new file mode 100644 index 000000000000..2550173e9540 --- /dev/null +++ b/js/web/test/data/ops/fast-gelu.jsonc @@ -0,0 +1,211 @@ +[ + { + "name": "FastGelu test without bias", + "operator": "FastGelu", + "opset": { "domain": "com.microsoft", "version": 1 }, + "cases": [ + { + "name": "scalar", + "inputs": [ + { + "data": [1], + "dims": [], + "type": "float32" + } + ], + "outputs": [ + { + "data": [0.841192], + "dims": [], + "type": "float32" + } + ] + }, + { + "name": "[2x4]", + "inputs": [ + { + "data": [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8], + "dims": [2, 4], + "type": "float32" + } + ], + "outputs": [ + { + "data": [0.0539828, 0.115851, 0.185371, 0.262161, 0.345714, 0.435415, 0.53057, 0.630432], + "dims": [2, 4], + "type": "float32" + } + ] + }, + { + "name": "[3x5]", + "inputs": [ + { + "data": [0.1, 0.2, 0.3, 0.4, 0.5, 1, 2, 3, 4, 5, 1.1, 1.2, 1.3, 1.4, 1.5], + "dims": [3, 5], + "type": "float32" + } + ], + "outputs": [ + { + "data": [ + 0.0539828, 0.115851, 0.185371, 0.262161, 0.345714, 0.841192, 1.9546, 2.99636, 3.99993, 5, 0.950581, + 1.0617, 1.17393, 1.28671, 1.39957 + ], + "dims": [3, 5], + "type": "float32" + } + ] + } + ] + }, + { + "name": "FastGelu test with bias", + "operator": "FastGelu", + "opset": { "domain": "com.microsoft", "version": 1 }, + "cases": [ + { + "name": "scalar", + "inputs": [ + { + "data": [1], + "dims": [], + "type": "float32" + }, + { + "data": [0.5], + "dims": [], + "type": "float32" + } + ], + "outputs": [ + { + "data": [1.39957], + "dims": [], + "type": "float32" + } + ] + }, + { + "name": "[2x4], [4]", + "inputs": [ + { + "data": [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8], + "dims": [2, 4], + "type": "float32" + }, + { + "data": [1, 2, 3, 4], + "dims": [4], + "type": "float32" + } + ], + "outputs": [ + { + "data": [0.950581, 2.16968, 3.29869, 4.39999, 1.39957, 2.58835, 3.69973, 4.8], + "dims": [2, 4], + "type": "float32" + } + ] + }, + { + "name": "[2x4], [3]", + "inputs": [ + { + "data": [0.1, 0.2, 0.3, 0.4, 
0.5, 0.6, 0.7, 0.8], + "dims": [2, 4], + "type": "float32" + }, + { + "data": [1, 2, 3], + "dims": [3], + "type": "float32" + } + ], + "outputs": [ + { + "data": [0.950581, 2.16968, 3.29869, 1.28671, 2.48492, 3.59959, 1.62411, 2.79331], + "dims": [2, 4], + "type": "float32" + } + ] + }, + { + "name": "[3x5], [2]", + "inputs": [ + { + "data": [0.1, 0.2, 0.3, 0.4, 0.5, 1, 2, 3, 4, 5, 1.1, 1.2, 1.3, 1.4, 1.5], + "dims": [3, 5], + "type": "float32" + }, + { + "data": [2, 3], + "dims": [2], + "type": "float32" + } + ], + "outputs": [ + { + "data": [ + 2.06267, 3.19813, 2.27567, 3.39909, 2.48492, 3.99993, 3.99993, 6, 6, 8, 3.09737, 4.19997, 3.29869, + 4.39999, 3.49938 + ], + "dims": [3, 5], + "type": "float32" + } + ] + }, + { + "name": "[3x5], [7]", + "inputs": [ + { + "data": [0.1, 0.2, 0.3, 0.4, 0.5, 1, 2, 3, 4, 5, 1.1, 1.2, 1.3, 1.4, 1.5], + "dims": [3, 5], + "type": "float32" + }, + { + "data": [2.1, 2.2, 2.3, 2.4, 2.5, 2.6, 2.7], + "dims": [7], + "type": "float32" + } + ], + "outputs": [ + { + "data": [ + 2.16968, 2.38072, 2.58835, 2.79331, 2.99636, 3.59959, 4.7, 5.1, 6.2, 7.3, 3.49938, 3.69973, 3.89989, + 4.09996, 3.59959 + ], + "dims": [3, 5], + "type": "float32" + } + ] + }, + { + "name": "[4x4], [8]", + "inputs": [ + { + "data": [0.8, -0.5, 0.0, 1, 1.3, 2.1, -0.2, 1.1, 0.5, 0.2, 0.3, -0.6, 3.1, 2.2, -1.1, 0.0], + "dims": [4, 4], + "type": "float32" + }, + { + "data": [-0.5, 0.6, 1.2, 2.1, 1.3, -1, 0, 3.1], + "dims": [8], + "type": "float32" + } + ], + "outputs": [ + { + "data": [ + 0.185371, 0.0539828, 1.0617, 3.09737, 2.58835, 0.950581, -0.0841486, 4.19997, 0, 0.630432, 1.39957, + 1.39957, 4.39999, 1.0617, -0.149419, 3.09737 + ], + "dims": [4, 4], + "type": "float32" + } + ] + } + ] + } +] diff --git a/js/web/test/data/ops/fused-conv.jsonc b/js/web/test/data/ops/fused-conv.jsonc index 812e9d7c2def..6a10e3b96a26 100644 --- a/js/web/test/data/ops/fused-conv.jsonc +++ b/js/web/test/data/ops/fused-conv.jsonc @@ -108,5 +108,327 @@ ] } ] + }, + { + "name": "fused conv with clip", + "operator": "FusedConv", + "attributes": [ + { "name": "activation", "data": "Clip", "type": "string" }, + { "name": "kernel_shape", "data": [2, 2], "type": "ints" }, + { "name": "activation_params", "data": [400.0, 600.0], "type": "floats" } + ], + "opset": { "domain": "com.microsoft", "version": 1 }, + "cases": [ + { + "name": "T[0]", + "inputs": [ + { + "data": [10, 20, 30, 40, 50, 60, 70, 80, 90], + "dims": [1, 1, 3, 3], + "type": "float32" + }, + { + "data": [1, 2, 3, 4], + "dims": [1, 1, 2, 2], + "type": "float32" + } + ], + "outputs": [ + { + "data": [400, 470, 600, 600], + "dims": [1, 1, 2, 2], + "type": "float32" + } + ] + } + ] + }, + { + "name": "fused conv with HardSigmoid", + "operator": "FusedConv", + "attributes": [ + { "name": "activation", "data": "HardSigmoid", "type": "string" }, + { "name": "kernel_shape", "data": [2, 2], "type": "ints" }, + { "name": "activation_params", "data": [2.0, 5.0], "type": "floats" } + ], + "opset": { "domain": "com.microsoft", "version": 1 }, + "cases": [ + { + "name": "T[0]", + "inputs": [ + { + "data": [10, 20, -30, -40, -50, -60, 70, 80, 90], + "dims": [1, 1, 3, 3], + "type": "float32" + }, + { + "data": [1, 2, 3, 4], + "dims": [1, 1, 2, 2], + "type": "float32" + } + ], + "outputs": [ + { + "data": [0, 0, 1, 1], + "dims": [1, 1, 2, 2], + "type": "float32" + } + ] + } + ] + }, + { + "name": "NHWC conv with HardSigmoid", + "operator": "Conv", + "attributes": [ + { "name": "activation", "data": "HardSigmoid", "type": "string" }, + { "name": "kernel_shape", "data": 
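The FusedConv cases fold the activation into the convolution kernel, with activation_params supplying the activation's constants: [min, max] for Clip, [alpha, beta] for HardSigmoid (y = clamp(alpha * x + beta, 0, 1)), and [alpha] for LeakyRelu. A sketch of that mapping (illustrative; the helper name is mine, not the PR's):

```typescript
type Activation = (x: number) => number;

// Map the `activation` attribute plus `activation_params` onto a function
// applied to each convolution output value.
function makeActivation(name: string, p: number[]): Activation {
  switch (name) {
    case 'Clip': return (x) => Math.min(Math.max(x, p[0]), p[1]);
    case 'HardSigmoid': return (x) => Math.max(0, Math.min(1, p[0] * x + p[1]));
    case 'LeakyRelu': return (x) => (x >= 0 ? x : p[0] * x);
    default: return (x) => x;
  }
}

// The raw conv outputs of the "fused conv with clip" case are [370, 470, 670, 770];
// clipping to [400, 600] gives the expected [400, 470, 600, 600].
console.log([370, 470, 670, 770].map(makeActivation('Clip', [400.0, 600.0])));
```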
[2, 2], "type": "ints" }, + { "name": "activation_params", "data": [2.0, 5.0], "type": "floats" } + ], + "opset": { "domain": "com.ms.internal.nhwc", "version": 1 }, + "cases": [ + { + "name": "T[0]", + "inputs": [ + { + "data": [10, 20, -30, -40, -50, -60, 70, 80, 90], + "dims": [1, 3, 3, 1], + "type": "float32" + }, + { + "data": [1, 2, 3, 4], + "dims": [1, 1, 2, 2], + "type": "float32" + } + ], + "outputs": [ + { + "data": [0, 0, 1, 1], + "dims": [1, 2, 2, 1], + "type": "float32" + } + ] + } + ] + }, + { + "name": "fused group-conv with HardSigmoid", + "operator": "FusedConv", + "attributes": [ + { "name": "activation", "data": "HardSigmoid", "type": "string" }, + { "name": "kernel_shape", "data": [2, 2], "type": "ints" }, + { "name": "group", "data": 3, "type": "int" }, + { "name": "activation_params", "data": [2.0, 5.0], "type": "floats" } + ], + "opset": { "domain": "com.microsoft", "version": 1 }, + "cases": [ + { + "name": "T[0]", + "inputs": [ + { + "data": [ + 0.0, 1.0, 2.0, -3.0, 4.0, -5.0, 6.0, 7.0, 8.0, -9.0, -10.0, 11.0, -12.0, 13.0, -14.0, 15.0, 16.0, 17.0, + 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0 + ], + "dims": [1, 3, 3, 3], + "type": "float32" + }, + { + "data": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], + "dims": [3, 1, 2, 2], + "type": "float32" + } + ], + "outputs": [ + { + "data": [1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1], + "dims": [1, 3, 2, 2], + "type": "float32" + } + ] + } + ] + }, + { + "name": "NHWC group-conv with HardSigmoid", + "operator": "Conv", + "attributes": [ + { "name": "activation", "data": "HardSigmoid", "type": "string" }, + { "name": "kernel_shape", "data": [2, 2], "type": "ints" }, + { "name": "group", "data": 3, "type": "int" }, + { "name": "activation_params", "data": [2.0, 5.0], "type": "floats" } + ], + "opset": { "domain": "com.ms.internal.nhwc", "version": 1 }, + "cases": [ + { + "name": "T[0]", + "inputs": [ + { + "data": [ + 0.0, 1.0, 2.0, -3.0, 4.0, -5.0, 6.0, 7.0, 8.0, -9.0, -10.0, 11.0, -12.0, 13.0, -14.0, 15.0, 16.0, 17.0, + 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0 + ], + "dims": [1, 3, 3, 3], + "type": "float32" + }, + { + "data": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], + "dims": [3, 1, 2, 2], + "type": "float32" + } + ], + "outputs": [ + { + "data": [0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1], + "dims": [1, 2, 2, 3], + "type": "float32" + } + ] + } + ] + }, + { + "name": "fused group-conv with LeakyRelu", + "operator": "FusedConv", + "attributes": [ + { "name": "activation", "data": "LeakyRelu", "type": "string" }, + { "name": "kernel_shape", "data": [2, 2], "type": "ints" }, + { "name": "group", "data": 3, "type": "int" }, + { "name": "activation_params", "data": [2.0], "type": "floats" } + ], + "opset": { "domain": "com.microsoft", "version": 1 }, + "cases": [ + { + "name": "T[0]", + "inputs": [ + { + "data": [ + 0.0, 1.0, 2.0, -3.0, 4.0, -5.0, 6.0, 7.0, 8.0, -9.0, -10.0, 11.0, -12.0, 13.0, -14.0, 15.0, 16.0, 17.0, + 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0 + ], + "dims": [1, 3, 3, 3], + "type": "float32" + }, + { + "data": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], + "dims": [3, 1, 2, 2], + "type": "float32" + } + ], + "outputs": [ + { + "data": [9, -6, 51, 47, -170, -10, 251, 229, 847, 889, 973, 1015], + "dims": [1, 3, 2, 2], + "type": "float32" + } + ] + } + ] + }, + { + "name": "NHWC group-conv with LeakyRelu", + "operator": "Conv", + "attributes": [ + { "name": "activation", "data": "LeakyRelu", "type": "string" }, + { "name": "kernel_shape", "data": [2, 2], "type": "ints" }, + { "name": "group", "data": 
3, "type": "int" }, + { "name": "activation_params", "data": [2.0], "type": "floats" } + ], + "opset": { "domain": "com.ms.internal.nhwc", "version": 1 }, + "cases": [ + { + "name": "T[0]", + "inputs": [ + { + "data": [ + 0.0, 1.0, 2.0, -3.0, 4.0, -5.0, 6.0, 7.0, 8.0, -9.0, -10.0, 11.0, -12.0, 13.0, -14.0, 15.0, 16.0, 17.0, + 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0 + ], + "dims": [1, 3, 3, 3], + "type": "float32" + }, + { + "data": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], + "dims": [3, 1, 2, 2], + "type": "float32" + } + ], + "outputs": [ + { + "data": [-162, 63, -158, 33, 281, 85, 105, 337, 455, 177, 515, 609], + "dims": [1, 2, 2, 3], + "type": "float32" + } + ] + } + ] + }, + { + "name": "fused conv with LeakyRelu", + "operator": "FusedConv", + "attributes": [ + { "name": "activation", "data": "LeakyRelu", "type": "string" }, + { "name": "kernel_shape", "data": [2, 2], "type": "ints" }, + { "name": "activation_params", "data": [2.0], "type": "floats" } + ], + "opset": { "domain": "com.microsoft", "version": 1 }, + "cases": [ + { + "name": "T[0]", + "inputs": [ + { + "data": [10, 20, -30, -40, -50, -60, 70, 80, 90], + "dims": [1, 1, 3, 3], + "type": "float32" + }, + { + "data": [1, 2, 3, 4], + "dims": [1, 1, 2, 2], + "type": "float32" + } + ], + "outputs": [ + { + "data": [-540, -860, 390, 430], + "dims": [1, 1, 2, 2], + "type": "float32" + } + ] + } + ] + }, + { + "name": "NHWC conv with LeakyRelu", + "operator": "Conv", + "attributes": [ + { "name": "activation", "data": "LeakyRelu", "type": "string" }, + { "name": "kernel_shape", "data": [2, 2], "type": "ints" }, + { "name": "activation_params", "data": [2.0], "type": "floats" } + ], + "opset": { "domain": "com.ms.internal.nhwc", "version": 1 }, + "cases": [ + { + "name": "T[0]", + "inputs": [ + { + "data": [10, 20, -30, -40, -50, -60, 70, 80, 90], + "dims": [1, 3, 3, 1], + "type": "float32" + }, + { + "data": [1, 2, 3, 4], + "dims": [1, 1, 2, 2], + "type": "float32" + } + ], + "outputs": [ + { + "data": [-540, -860, 390, 430], + "dims": [1, 2, 2, 1], + "type": "float32" + } + ] + } + ] } ] diff --git a/js/web/test/data/ops/gather.jsonc b/js/web/test/data/ops/gather.jsonc index 0be077d237b8..d218d120d356 100644 --- a/js/web/test/data/ops/gather.jsonc +++ b/js/web/test/data/ops/gather.jsonc @@ -99,6 +99,28 @@ "operator": "Gather", "attributes": [], "cases": [ + { + "name": "data[4] indices[]", + "inputs": [ + { + "data": [false, true, false, false], + "dims": [4], + "type": "bool" + }, + { + "data": [1], + "dims": [], + "type": "int32" + } + ], + "outputs": [ + { + "data": [true], + "dims": [], + "type": "bool" + } + ] + }, { "name": "data[2,4] indices[1]", "inputs": [ diff --git a/js/web/test/data/ops/instance-norm.jsonc b/js/web/test/data/ops/instance-norm.jsonc index 6a4e6912405e..f28b016d47ab 100644 --- a/js/web/test/data/ops/instance-norm.jsonc +++ b/js/web/test/data/ops/instance-norm.jsonc @@ -38,6 +38,79 @@ } ] }, + { + "name": "Simple test with NHWC, components 1", + "operator": "InstanceNormalization", + "inputShapeDefinitions": "rankOnly", + "opset": { "domain": "", "version": 17 }, + "cases": [ + { + "name": "Simple test", + "inputs": [ + { + "data": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 9, 8, 7, 6, 5], + "dims": [1, 5, 3, 1], + "type": "float32" + }, + { + "data": [1, 2, 3, 4, 5], + "dims": [5], + "type": "float32" + }, + { + "data": [4, 5, 6, 7, 8], + "dims": [5], + "type": "float32" + } + ], + "outputs": [ + { + "data": [ + 2.775264263153076, 4, 5.224735260009766, 2.5505285263061523, 5, 7.449470520019531, 
2.325794219970703, 6, + 9.674205780029297, 11.898944854736328, 7, 2.1010589599609375, 14.123676300048828, 8, 1.876321792602539 + ], + "dims": [1, 5, 3, 1], + "type": "float32" + } + ] + } + ] + }, + { + "name": "Simple test with NHWC, components 2", + "operator": "InstanceNormalization", + "inputShapeDefinitions": "rankOnly", + "opset": { "domain": "", "version": 17 }, + "cases": [ + { + "name": "Simple test", + "inputs": [ + { + "data": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 9, 8], + "dims": [2, 6, 1, 1], + "type": "float32" + }, + { + "data": [1, 2, 3, 4, 5, 6], + "dims": [6], + "type": "float32" + }, + { + "data": [4, 5, 6, 7, 8, 9], + "dims": [6], + "type": "float32" + } + ], + "outputs": [ + { + "data": [4, 5, 6, 7, 8, 9, 4, 5, 6, 7, 8, 9], + "dims": [2, 6, 1, 1], + "type": "float32" + } + ] + } + ] + }, { "name": "Simple test with NCHW", "operator": "InstanceNormalization", @@ -75,5 +148,161 @@ ] } ] + }, + { + "name": "Simple test with NCHW, components 1", + "operator": "InstanceNormalization", + "opset": { "domain": "", "version": 17 }, + "cases": [ + { + "name": "Simple test", + "inputs": [ + { + "data": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 9, 8, 7, 6, 5], + "dims": [1, 5, 3, 1], + "type": "float32" + }, + { + "data": [1, 2, 3, 4, 5], + "dims": [5], + "type": "float32" + }, + { + "data": [4, 5, 6, 7, 8], + "dims": [5], + "type": "float32" + } + ], + "outputs": [ + { + "data": [ + 2.775264263153076, 4, 5.224735260009766, 2.5505285263061523, 5, 7.449470520019531, 2.325794219970703, 6, + 9.674205780029297, 11.898944854736328, 7, 2.1010589599609375, 14.123676300048828, 8, 1.876321792602539 + ], + "dims": [1, 5, 3, 1], + "type": "float32" + } + ] + } + ] + }, + { + "name": "Simple test with NCHW, components 2", + "operator": "InstanceNormalization", + "opset": { "domain": "", "version": 17 }, + "cases": [ + { + "name": "Simple test", + "inputs": [ + { + "data": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 9, 8, 7, 6, 5, 4, 3, 2], + "dims": [1, 3, 6, 1], + "type": "float32" + }, + { + "data": [1, 2, 3], + "dims": [3], + "type": "float32" + }, + { + "data": [4, 5, 6], + "dims": [3], + "type": "float32" + } + ], + "outputs": [ + { + "data": [ + 2.5361523628234863, 3.1216912269592285, 3.70723032951355, 4.292769432067871, 4.878308296203613, + 5.4638471603393555, 1.8666191101074219, 3.9555397033691406, 6.044460296630859, 8.133380889892578, + 6.044460296630859, 3.9555397033691406, 10.3915433883667, 8.634925842285156, 6.878308296203613, + 5.121691703796387, 3.365074634552002, 1.6084575653076172 + ], + "dims": [1, 3, 6, 1], + "type": "float32" + } + ] + } + ] + }, + { + "name": "Simple test with NHWC, components 1, buffer reuse", + "operator": "InstanceNormalization", + "inputShapeDefinitions": "rankOnly", + "opset": { + "domain": "", + "version": 17 + }, + "cases": [ + { + "name": "Simple test", + "inputs": [ + { + "data": [1, 2, 3, 4, 5, 6], + "dims": [2, 3, 1, 1], + "type": "float32" + }, + { + "data": [1, 2, 3], + "dims": [3], + "type": "float32" + }, + { + "data": [4, 5, 6], + "dims": [3], + "type": "float32" + } + ], + "outputs": [ + { + "data": [4, 5, 6, 4, 5, 6], + "dims": [2, 3, 1, 1], + "type": "float32" + } + ] + } + ] + }, + { + "name": "Simple test with NHWC, components 2, buffer reuse", + "operator": "InstanceNormalization", + "inputShapeDefinitions": "rankOnly", + "opset": { + "domain": "", + "version": 17 + }, + "cases": [ + { + "name": "Simple test", + "inputs": [ + { + "data": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 9, 8, 7, 6, 5, 4, 3, 2], + "dims": [1, 6, 1, 3], + "type": "float32" + }, + { + "data": [1, 2, 
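The instance-norm.jsonc additions (the "components" and "buffer reuse" variants presumably target different WebGPU vectorization widths and output-buffer paths) all reduce to one definition: normalize each (batch, channel) slice over its spatial elements, then scale and shift, with ONNX's default epsilon of 1e-5. A quick sketch that reproduces the expected values; the helper is hypothetical:

    // InstanceNormalization over one (batch, channel) slice:
    // y = scale * (x - mean) / sqrt(variance + epsilon) + bias
    function instanceNormSlice(x: number[], scale: number, bias: number, epsilon = 1e-5): number[] {
      const mean = x.reduce((a, b) => a + b, 0) / x.length;
      const variance = x.reduce((a, b) => a + (b - mean) ** 2, 0) / x.length;
      const inv = 1 / Math.sqrt(variance + epsilon);
      return x.map((v) => scale * (v - mean) * inv + bias);
    }

    // Channel 0 of the NCHW "components 1" case: slice [1, 2, 3], scale 1, bias 4
    // -> [2.77526..., 4, 5.22473...], matching the expected output above.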
3, 4, 5, 6], + "dims": [6], + "type": "float32" + }, + { + "data": [4, 5, 6, 7, 8, 9], + "dims": [6], + "type": "float32" + } + ], + "outputs": [ + { + "data": [ + 2.775264263153076, 4, 5.224735260009766, 2.5505285263061523, 5, 7.449470520019531, 2.325794219970703, 6, + 9.674205780029297, 11.898944854736328, 7, 2.1010589599609375, 14.123676300048828, 8, 1.876321792602539, + 16.348413467407227, 9, 1.6515865325927734 + ], + "dims": [1, 6, 1, 3], + "type": "float32" + } + ] + } + ] } ] diff --git a/js/web/test/data/ops/matmulnbits.jsonc b/js/web/test/data/ops/matmulnbits.jsonc new file mode 100644 index 000000000000..63e0a0ed5287 --- /dev/null +++ b/js/web/test/data/ops/matmulnbits.jsonc @@ -0,0 +1,2486 @@ +[ + { + "name": "MatMulNBits; K=16, N=8, block_size=16, bits=4", + "operator": "MatMulNBits", + "opset": { "domain": "com.microsoft", "version": 1 }, + "attributes": [ + { "name": "K", "data": 16, "type": "int" }, + { "name": "N", "data": 8, "type": "int" }, + { "name": "block_size", "data": 16, "type": "int" }, + { "name": "bits", "data": 4, "type": "int" } + ], + "cases": [ + { + "name": "MatMulNBits; K=16, N=8, block_size=16, bits=4; symmetric", + "inputs": [ + { + "data": [ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, + 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, + 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, + 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, + 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, + 127 + ], + "dims": [8, 16], + "type": "float32" + }, + { + "dims": [8, 1, 8], + "type": "uint8", + "data": [ + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, + 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, + 56, 57, 58, 59, 60, 61, 62, 63, 64 + ] + }, + { + "dims": [8], + "type": "float32", + "data": [0, 1, 2, 3, 4, 5, 6, 7] + } + ], + "outputs": [ + { + "dims": [8, 8], + "type": "float32", + "data": [ + 0, -385, -1120, -963, -1984, -1285, -2592, -1351, 0, -1073, -3808, -2643, -6848, -3445, -9120, -3479, 0, + -1761, -6496, -4323, -11712, -5605, -15648, -5607, 0, -2449, -9184, -6003, -16576, -7765, -22176, -7735, + 0, -3137, -11872, -7683, -21440, -9925, -28704, -9863, 0, -3825, -14560, -9363, -26304, -12085, -35232, + -11991, 0, -4513, -17248, -11043, -31168, -14245, -41760, -14119, 0, -5201, -19936, -12723, -36032, + -16405, -48288, -16247 + ] + } + ] + } + ] + }, + { + "name": "MatMulNBits; K=16, N=8, block_size=16, bits=4", + "operator": "MatMulNBits", + "opset": { "domain": "com.microsoft", "version": 1 }, + "attributes": [ + { "name": "K", "data": 16, "type": "int" }, + { "name": "N", "data": 8, "type": "int" }, + { "name": "block_size", "data": 16, "type": "int" }, + { "name": "bits", "data": 4, "type": "int" } + ], + "cases": [ + { + "name": "MatMulNBits; K=16, N=8, block_size=16, bits=4; asymmetric", + "inputs": [ + { + "data": [ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, + 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, + 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, + 81, 82, 83, 84, 85, 
86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, + 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, + 127 + ], + "dims": [8, 16], + "type": "float32" + }, + { + "dims": [8, 1, 8], + "type": "uint8", + "data": [ + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, + 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, + 56, 57, 58, 59, 60, 61, 62, 63, 64 + ] + }, + { + "dims": [8], + "type": "float32", + "data": [0, 1, 2, 3, 4, 5, 6, 7] + }, + { + "dims": [8], + "type": "uint8", + "data": [248, 249, 250, 251, 252, 253, 254, 255] + } + ], + "outputs": [ + { + "dims": [8, 8], + "type": "float32", + "data": [ + 0, -505, -1600, -2043, -3904, -4285, -6912, -7231, 0, -1449, -5312, -6027, -12864, -12845, -22656, -21903, + 0, -2393, -9024, -10011, -21824, -21405, -38400, -36575, 0, -3337, -12736, -13995, -30784, -29965, -54144, + -51247, 0, -4281, -16448, -17979, -39744, -38525, -69888, -65919, 0, -5225, -20160, -21963, -48704, + -47085, -85632, -80591, 0, -6169, -23872, -25947, -57664, -55645, -101376, -95263, 0, -7113, -27584, + -29931, -66624, -64205, -117120, -109935 + ] + } + ] + } + ] + }, + { + "name": "MatMulNBits; K=32, N=8, block_size=16, bits=4", + "operator": "MatMulNBits", + "opset": { "domain": "com.microsoft", "version": 1 }, + "attributes": [ + { "name": "K", "data": 32, "type": "int" }, + { "name": "N", "data": 8, "type": "int" }, + { "name": "block_size", "data": 16, "type": "int" }, + { "name": "bits", "data": 4, "type": "int" } + ], + "cases": [ + { + "name": "MatMulNBits; K=32, N=8, block_size=16, bits=4; symmetric", + "inputs": [ + { + "data": [ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, + 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, + 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, + 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, + 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, + 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, + 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, + 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, + 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, + 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, + 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, + 253, 254, 255 + ], + "dims": [8, 32], + "type": "float32" + }, + { + "dims": [8, 2, 8], + "type": "uint8", + "data": [ + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, + 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, + 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, + 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, + 107, 108, 109, 110, 111, 112, 113, 114, 
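For the matmulnbits.jsonc cases, the second input is the weight matrix quantized to `bits` bits: each of the N output columns stores K values in block_size-long blocks, two 4-bit values per byte with the low nibble first, one float scale per block, and an optional packed uint8 zero-point input (when absent, the zero point defaults to 8, which is what the "symmetric" cases rely on). Dequantization is (q - zero_point) * scale, and the output is A times the dequantized B transposed. A sketch of one output cell under those assumptions (zero points are taken here already unpacked; names are illustrative):

    // One cell of MatMulNBits: dot a row of A with one dequantized N-row of B.
    function matmulNBitsCell(
      aRow: number[],         // K float values from A
      bBytes: Uint8Array,     // ceil(K / 2) packed bytes for this N-row
      scales: number[],       // one scale per block
      blockSize: number,
      zeroPoints?: number[]   // one unpacked zero point per block; defaults to 8
    ): number {
      let acc = 0;
      for (let k = 0; k < aRow.length; k++) {
        const byte = bBytes[k >> 1];
        const q = k % 2 === 0 ? byte & 0x0f : byte >> 4; // low nibble first
        const block = Math.floor(k / blockSize);
        const zp = zeroPoints ? zeroPoints[block] : 8;
        acc += aRow[k] * (q - zp) * scales[block];
      }
      return acc;
    }

    // K=16, N=8 symmetric case above: A row 0 = [0..15], N-row 1 of B = bytes [9..16],
    // scale 1 -> matmulNBitsCell(...) = -385, the second expected output value.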
115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, + 128 + ] + }, + { + "dims": [16], + "type": "float32", + "data": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] + } + ], + "outputs": [ + { + "dims": [8, 8], + "type": "float32", + "data": [ + -1073, -3763, -5429, -6071, -5689, -4283, -1853, 1601, -2449, -12499, -19477, -23383, -24217, -21979, + -16669, -8287, -3825, -21235, -33525, -40695, -42745, -39675, -31485, -18175, -5201, -29971, -47573, + -58007, -61273, -57371, -46301, -28063, -6577, -38707, -61621, -75319, -79801, -75067, -61117, -37951, + -7953, -47443, -75669, -92631, -98329, -92763, -75933, -47839, -9329, -56179, -89717, -109943, -116857, + -110459, -90749, -57727, -10705, -64915, -103765, -127255, -135385, -128155, -105565, -67615 + ] + } + ] + } + ] + }, + { + "name": "MatMulNBits; K=32, N=8, block_size=16, bits=4", + "operator": "MatMulNBits", + "opset": { "domain": "com.microsoft", "version": 1 }, + "attributes": [ + { "name": "K", "data": 32, "type": "int" }, + { "name": "N", "data": 8, "type": "int" }, + { "name": "block_size", "data": 16, "type": "int" }, + { "name": "bits", "data": 4, "type": "int" } + ], + "cases": [ + { + "name": "MatMulNBits; K=32, N=8, block_size=16, bits=4; asymmetric", + "inputs": [ + { + "data": [ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, + 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, + 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, + 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, + 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, + 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, + 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, + 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, + 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, + 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, + 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, + 253, 254, 255 + ], + "dims": [8, 32], + "type": "float32" + }, + { + "dims": [8, 2, 8], + "type": "uint8", + "data": [ + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, + 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, + 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, + 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, + 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, + 128 + ] + }, + { + "dims": [16], + "type": "float32", + "data": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] + }, + { + "dims": [8], + "type": "uint8", + "data": [0, 1, 2, 3, 4, 5, 6, 7] + } + ], + "outputs": [ + { + "dims": [8, 8], + "type": "float32", + "data": [ + 1935, 6941, 12491, 18585, 25223, 32405, 40131, 48401, 4655, 17661, 31211, 45305, 59943, 75125, 90851, + 107121, 7375, 28381, 49931, 72025, 94663, 117845, 
141571, 165841, 10095, 39101, 68651, 98745, 129383, + 160565, 192291, 224561, 12815, 49821, 87371, 125465, 164103, 203285, 243011, 283281, 15535, 60541, 106091, + 152185, 198823, 246005, 293731, 342001, 18255, 71261, 124811, 178905, 233543, 288725, 344451, 400721, + 20975, 81981, 143531, 205625, 268263, 331445, 395171, 459441 + ] + } + ] + } + ] + }, + { + "name": "MatMulNBits; K=48, N=8, block_size=16, bits=4", + "operator": "MatMulNBits", + "opset": { "domain": "com.microsoft", "version": 1 }, + "attributes": [ + { "name": "K", "data": 48, "type": "int" }, + { "name": "N", "data": 8, "type": "int" }, + { "name": "block_size", "data": 16, "type": "int" }, + { "name": "bits", "data": 4, "type": "int" } + ], + "cases": [ + { + "name": "MatMulNBits; K=48, N=8, block_size=16, bits=4; symmetric", + "inputs": [ + { + "data": [ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, + 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, + 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, + 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, + 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, + 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, + 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, + 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, + 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, + 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, + 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, + 253, 254, 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, + 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, + 295, 296, 297, 298, 299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314, 315, + 316, 317, 318, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, + 337, 338, 339, 340, 341, 342, 343, 344, 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, + 358, 359, 360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376, 377, 378, + 379, 380, 381, 382, 383 + ], + "dims": [8, 48], + "type": "float32" + }, + { + "dims": [8, 3, 8], + "type": "uint8", + "data": [ + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, + 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, + 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, + 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, + 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, + 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, + 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, + 170, 171, 172, 
173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, + 191, 192 + ] + }, + { + "dims": [24], + "type": "float32", + "data": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23] + } + ], + "outputs": [ + { + "dims": [8, 8], + "type": "float32", + "data": [ + -7569, -13416, -24375, -14292, -20445, 5568, 4221, 46164, -17697, -39528, -73383, -45588, -66861, 10560, + 1869, 128916, -27825, -65640, -122391, -76884, -113277, 15552, -483, 211668, -37953, -91752, -171399, + -108180, -159693, 20544, -2835, 294420, -48081, -117864, -220407, -139476, -206109, 25536, -5187, 377172, + -58209, -143976, -269415, -170772, -252525, 30528, -7539, 459924, -68337, -170088, -318423, -202068, + -298941, 35520, -9891, 542676, -78465, -196200, -367431, -233364, -345357, 40512, -12243, 625428 + ] + } + ] + } + ] + }, + { + "name": "MatMulNBits; K=48, N=8, block_size=16, bits=4", + "operator": "MatMulNBits", + "opset": { "domain": "com.microsoft", "version": 1 }, + "attributes": [ + { "name": "K", "data": 48, "type": "int" }, + { "name": "N", "data": 8, "type": "int" }, + { "name": "block_size", "data": 16, "type": "int" }, + { "name": "bits", "data": 4, "type": "int" } + ], + "cases": [ + { + "name": "MatMulNBits; K=48, N=8, block_size=16, bits=4; asymmetric", + "inputs": [ + { + "data": [ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, + 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, + 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, + 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, + 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, + 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, + 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, + 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, + 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, + 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, + 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, + 253, 254, 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, + 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, + 295, 296, 297, 298, 299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314, 315, + 316, 317, 318, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, + 337, 338, 339, 340, 341, 342, 343, 344, 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, + 358, 359, 360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376, 377, 378, + 379, 380, 381, 382, 383 + ], + "dims": [8, 48], + "type": "float32" + }, + { + "dims": [8, 3, 8], + "type": "uint8", + "data": [ + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, + 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, + 56, 57, 58, 59, 60, 61, 62, 63, 
64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, + 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, + 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, + 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, + 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, + 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, + 191, 192 + ] + }, + { + "dims": [24], + "type": "float32", + "data": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23] + }, + { + "dims": [16], + "type": "uint8", + "data": [240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255] + } + ], + "outputs": [ + { + "dims": [8, 8], + "type": "float32", + "data": [ + -1353, -5984, -24751, -31500, -63509, -72376, -117627, -128612, -6105, -20576, -74527, -94284, -190565, + -215608, -354219, -384548, -10857, -35168, -124303, -157068, -317621, -358840, -590811, -640484, -15609, + -49760, -174079, -219852, -444677, -502072, -827403, -896420, -20361, -64352, -223855, -282636, -571733, + -645304, -1063995, -1152356, -25113, -78944, -273631, -345420, -698789, -788536, -1300587, -1408292, + -29865, -93536, -323407, -408204, -825845, -931768, -1537179, -1664228, -34617, -108128, -373183, -470988, + -952901, -1075000, -1773771, -1920164 + ] + } + ] + } + ] + }, + { + "name": "MatMulNBits; K=64, N=8, block_size=16, bits=4", + "operator": "MatMulNBits", + "opset": { "domain": "com.microsoft", "version": 1 }, + "attributes": [ + { "name": "K", "data": 64, "type": "int" }, + { "name": "N", "data": 8, "type": "int" }, + { "name": "block_size", "data": 16, "type": "int" }, + { "name": "bits", "data": 4, "type": "int" } + ], + "cases": [ + { + "name": "MatMulNBits; K=64, N=8, block_size=16, bits=4; symmetric", + "inputs": [ + { + "data": [ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, + 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, + 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, + 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, + 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, + 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, + 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, + 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, + 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, + 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, + 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, + 253, 254, 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, + 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, + 295, 296, 297, 298, 299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314, 
315, + 316, 317, 318, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, + 337, 338, 339, 340, 341, 342, 343, 344, 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, + 358, 359, 360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376, 377, 378, + 379, 380, 381, 382, 383, 384, 385, 386, 387, 388, 389, 390, 391, 392, 393, 394, 395, 396, 397, 398, 399, + 400, 401, 402, 403, 404, 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415, 416, 417, 418, 419, 420, + 421, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434, 435, 436, 437, 438, 439, 440, 441, + 442, 443, 444, 445, 446, 447, 448, 449, 450, 451, 452, 453, 454, 455, 456, 457, 458, 459, 460, 461, 462, + 463, 464, 465, 466, 467, 468, 469, 470, 471, 472, 473, 474, 475, 476, 477, 478, 479, 480, 481, 482, 483, + 484, 485, 486, 487, 488, 489, 490, 491, 492, 493, 494, 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, + 505, 506, 507, 508, 509, 510, 511 + ], + "dims": [8, 64], + "type": "float32" + }, + { + "dims": [8, 4, 8], + "type": "uint8", + "data": [ + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, + 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, + 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, + 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, + 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, + 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, + 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, + 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, + 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, + 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, + 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, + 254, 255, 256 + ] + }, + { + "dims": [32], + "type": "float32", + "data": [ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, + 29, 30, 31 + ] + } + ], + "outputs": [ + { + "dims": [8, 8], + "type": "float32", + "data": [ + -13572, -28812, -27668, -10140, 23772, 74068, 140748, 192564, -33796, -91532, -100116, -59548, 30172, + 169044, 357068, 531252, -54020, -154252, -172564, -108956, 36572, 264020, 573388, 869940, -74244, -216972, + -245012, -158364, 42972, 358996, 789708, 1208628, -94468, -279692, -317460, -207772, 49372, 453972, + 1006028, 1547316, -114692, -342412, -389908, -257180, 55772, 548948, 1222348, 1886004, -134916, -405132, + -462356, -306588, 62172, 643924, 1438668, 2224692, -155140, -467852, -534804, -355996, 68572, 738900, + 1654988, 2563380 + ] + } + ] + } + ] + }, + { + "name": "MatMulNBits; K=64, N=8, block_size=16, bits=4", + "operator": "MatMulNBits", + "opset": { "domain": "com.microsoft", "version": 1 }, + "attributes": [ + { "name": "K", "data": 64, "type": "int" }, + { "name": "N", "data": 8, "type": "int" }, + { "name": "block_size", "data": 16, "type": "int" }, + { "name": "bits", "data": 4, "type": "int" } + ], + "cases": [ + { + "name": "MatMulNBits; K=64, 
N=8, block_size=16, bits=4; asymmetric", + "inputs": [ + { + "data": [ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, + 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, + 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, + 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, + 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, + 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, + 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, + 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, + 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, + 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, + 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, + 253, 254, 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, + 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, + 295, 296, 297, 298, 299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314, 315, + 316, 317, 318, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, + 337, 338, 339, 340, 341, 342, 343, 344, 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, + 358, 359, 360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376, 377, 378, + 379, 380, 381, 382, 383, 384, 385, 386, 387, 388, 389, 390, 391, 392, 393, 394, 395, 396, 397, 398, 399, + 400, 401, 402, 403, 404, 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415, 416, 417, 418, 419, 420, + 421, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434, 435, 436, 437, 438, 439, 440, 441, + 442, 443, 444, 445, 446, 447, 448, 449, 450, 451, 452, 453, 454, 455, 456, 457, 458, 459, 460, 461, 462, + 463, 464, 465, 466, 467, 468, 469, 470, 471, 472, 473, 474, 475, 476, 477, 478, 479, 480, 481, 482, 483, + 484, 485, 486, 487, 488, 489, 490, 491, 492, 493, 494, 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, + 505, 506, 507, 508, 509, 510, 511 + ], + "dims": [8, 64], + "type": "float32" + }, + { + "dims": [8, 4, 8], + "type": "uint8", + "data": [ + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, + 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, + 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, + 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, + 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, + 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, + 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, + 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, + 191, 192, 193, 194, 
195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, + 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, + 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, + 254, 255, 256 + ] + }, + { + "dims": [32], + "type": "float32", + "data": [ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, + 29, 30, 31 + ] + }, + { + "dims": [16], + "type": "uint8", + "data": [240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255] + } + ], + "outputs": [ + { + "dims": [8, 8], + "type": "float32", + "data": [ + -26004, -63644, -96932, -125868, -150452, -170684, -186564, -229340, -60564, -157084, -249252, -337068, + -420532, -499644, -574404, -707804, -95124, -250524, -401572, -548268, -690612, -828604, -962244, + -1186268, -129684, -343964, -553892, -759468, -960692, -1157564, -1350084, -1664732, -164244, -437404, + -706212, -970668, -1230772, -1486524, -1737924, -2143196, -198804, -530844, -858532, -1181868, -1500852, + -1815484, -2125764, -2621660, -233364, -624284, -1010852, -1393068, -1770932, -2144444, -2513604, + -3100124, -267924, -717724, -1163172, -1604268, -2041012, -2473404, -2901444, -3578588 + ] + } + ] + } + ] + }, + { + "name": "MatMulNBits; K=80, N=8, block_size=16, bits=4", + "operator": "MatMulNBits", + "opset": { "domain": "com.microsoft", "version": 1 }, + "attributes": [ + { "name": "K", "data": 80, "type": "int" }, + { "name": "N", "data": 8, "type": "int" }, + { "name": "block_size", "data": 16, "type": "int" }, + { "name": "bits", "data": 4, "type": "int" } + ], + "cases": [ + { + "name": "MatMulNBits; K=80, N=8, block_size=16, bits=4; asymmetric", + "inputs": [ + { + "data": [ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, + 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, + 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, + 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, + 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, + 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, + 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, + 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, + 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, + 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, + 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, + 253, 254, 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, + 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, + 295, 296, 297, 298, 299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314, 315, + 316, 317, 318, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, + 337, 338, 339, 340, 341, 342, 343, 344, 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, + 358, 359, 
360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376, 377, 378, + 379, 380, 381, 382, 383, 384, 385, 386, 387, 388, 389, 390, 391, 392, 393, 394, 395, 396, 397, 398, 399, + 400, 401, 402, 403, 404, 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415, 416, 417, 418, 419, 420, + 421, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434, 435, 436, 437, 438, 439, 440, 441, + 442, 443, 444, 445, 446, 447, 448, 449, 450, 451, 452, 453, 454, 455, 456, 457, 458, 459, 460, 461, 462, + 463, 464, 465, 466, 467, 468, 469, 470, 471, 472, 473, 474, 475, 476, 477, 478, 479, 480, 481, 482, 483, + 484, 485, 486, 487, 488, 489, 490, 491, 492, 493, 494, 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, + 505, 506, 507, 508, 509, 510, 511, 512, 513, 514, 515, 516, 517, 518, 519, 520, 521, 522, 523, 524, 525, + 526, 527, 528, 529, 530, 531, 532, 533, 534, 535, 536, 537, 538, 539, 540, 541, 542, 543, 544, 545, 546, + 547, 548, 549, 550, 551, 552, 553, 554, 555, 556, 557, 558, 559, 560, 561, 562, 563, 564, 565, 566, 567, + 568, 569, 570, 571, 572, 573, 574, 575, 576, 577, 578, 579, 580, 581, 582, 583, 584, 585, 586, 587, 588, + 589, 590, 591, 592, 593, 594, 595, 596, 597, 598, 599, 600, 601, 602, 603, 604, 605, 606, 607, 608, 609, + 610, 611, 612, 613, 614, 615, 616, 617, 618, 619, 620, 621, 622, 623, 624, 625, 626, 627, 628, 629, 630, + 631, 632, 633, 634, 635, 636, 637, 638, 639 + ], + "dims": [8, 80], + "type": "float32" + }, + { + "dims": [8, 5, 8], + "type": "uint8", + "data": [ + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, + 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, + 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, + 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, + 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, + 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, + 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, + 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, + 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, + 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, + 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, + 254, 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, + 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, + 296, 297, 298, 299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314, 315, 316, + 317, 318, 319, 320 + ] + }, + { + "dims": [40], + "type": "float32", + "data": [ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, + 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39 + ] + }, + { + "dims": [24], + "type": "uint8", + "data": [ + 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259, 260, + 261, 262, 263 + ] + } + ], + "outputs": [ + { + "dims": [8, 8], + "type": "float32", + "data": [ + -19988, -63429, 
-155448, -216179, -358428, 740351, 259888, 172481, -56788, -186869, -451128, -632899, + -1053788, 1574031, 1165488, 546481, -93588, -310309, -746808, -1049619, -1749148, 2407711, 2071088, + 920481, -130388, -433749, -1042488, -1466339, -2444508, 3241391, 2976688, 1294481, -167188, -557189, + -1338168, -1883059, -3139868, 4075071, 3882288, 1668481, -203988, -680629, -1633848, -2299779, -3835228, + 4908751, 4787888, 2042481, -240788, -804069, -1929528, -2716499, -4530588, 5742431, 5693488, 2416481, + -277588, -927509, -2225208, -3133219, -5225948, 6576111, 6599088, 2790481 + ] + } + ] + } + ] + }, + { + "name": "MatMulNBits; K=16, N=16, block_size=16, bits=4", + "operator": "MatMulNBits", + "opset": { "domain": "com.microsoft", "version": 1 }, + "attributes": [ + { "name": "K", "data": 16, "type": "int" }, + { "name": "N", "data": 16, "type": "int" }, + { "name": "block_size", "data": 16, "type": "int" }, + { "name": "bits", "data": 4, "type": "int" } + ], + "cases": [ + { + "name": "MatMulNBits; K=16, N=16, block_size=16, bits=4; symmetric", + "inputs": [ + { + "data": [ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, + 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, + 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, + 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, + 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, + 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, + 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, + 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, + 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, + 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, + 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, + 253, 254, 255 + ], + "dims": [16, 16], + "type": "float32" + }, + { + "dims": [16, 1, 8], + "type": "uint8", + "data": [ + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, + 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, + 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, + 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, + 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, + 128 + ] + }, + { + "dims": [16], + "type": "float32", + "data": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] + } + ], + "outputs": [ + { + "dims": [16, 16], + "type": "float32", + "data": [ + 0, -385, -1120, -963, -1984, -1285, -2592, -1351, -2944, -1161, -3040, -715, -2880, -13, -2464, 945, 0, + -1073, -3808, -2643, -6848, -3445, -9120, -3479, -10624, -2745, -11360, -1243, -11328, 1027, -10528, 4065, + 0, -1761, -6496, -4323, -11712, -5605, -15648, -5607, -18304, -4329, -19680, -1771, -19776, 2067, -18592, + 7185, 0, -2449, -9184, -6003, -16576, -7765, -22176, -7735, -25984, -5913, -28000, -2299, -28224, 3107, 
+ -26656, 10305, 0, -3137, -11872, -7683, -21440, -9925, -28704, -9863, -33664, -7497, -36320, -2827, + -36672, 4147, -34720, 13425, 0, -3825, -14560, -9363, -26304, -12085, -35232, -11991, -41344, -9081, + -44640, -3355, -45120, 5187, -42784, 16545, 0, -4513, -17248, -11043, -31168, -14245, -41760, -14119, + -49024, -10665, -52960, -3883, -53568, 6227, -50848, 19665, 0, -5201, -19936, -12723, -36032, -16405, + -48288, -16247, -56704, -12249, -61280, -4411, -62016, 7267, -58912, 22785, 0, -5889, -22624, -14403, + -40896, -18565, -54816, -18375, -64384, -13833, -69600, -4939, -70464, 8307, -66976, 25905, 0, -6577, + -25312, -16083, -45760, -20725, -61344, -20503, -72064, -15417, -77920, -5467, -78912, 9347, -75040, + 29025, 0, -7265, -28000, -17763, -50624, -22885, -67872, -22631, -79744, -17001, -86240, -5995, -87360, + 10387, -83104, 32145, 0, -7953, -30688, -19443, -55488, -25045, -74400, -24759, -87424, -18585, -94560, + -6523, -95808, 11427, -91168, 35265, 0, -8641, -33376, -21123, -60352, -27205, -80928, -26887, -95104, + -20169, -102880, -7051, -104256, 12467, -99232, 38385, 0, -9329, -36064, -22803, -65216, -29365, -87456, + -29015, -102784, -21753, -111200, -7579, -112704, 13507, -107296, 41505, 0, -10017, -38752, -24483, + -70080, -31525, -93984, -31143, -110464, -23337, -119520, -8107, -121152, 14547, -115360, 44625, 0, + -10705, -41440, -26163, -74944, -33685, -100512, -33271, -118144, -24921, -127840, -8635, -129600, 15587, + -123424, 47745 + ] + } + ] + } + ] + }, + { + "name": "MatMulNBits; K=16, N=16, block_size=16, bits=4", + "operator": "MatMulNBits", + "opset": { "domain": "com.microsoft", "version": 1 }, + "attributes": [ + { "name": "K", "data": 16, "type": "int" }, + { "name": "N", "data": 16, "type": "int" }, + { "name": "block_size", "data": 16, "type": "int" }, + { "name": "bits", "data": 4, "type": "int" } + ], + "cases": [ + { + "name": "MatMulNBits; K=16, N=16, block_size=16, bits=4; asymmetric", + "inputs": [ + { + "data": [ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, + 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, + 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, + 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, + 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, + 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, + 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, + 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, + 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, + 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, + 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, + 253, 254, 255 + ], + "dims": [16, 16], + "type": "float32" + }, + { + "dims": [16, 1, 8], + "type": "uint8", + "data": [ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, + 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, + 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 
68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, + 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, + 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, + 127 + ] + }, + { + "dims": [16], + "type": "float32", + "data": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] + }, + { + "dims": [16], + "type": "uint8", + "data": [16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] + } + ], + "outputs": [ + { + "dims": [16, 16], + "type": "float32", + "data": [ + 0, 608, 208, 1296, -288, 1280, -1488, 560, -3392, -864, -6000, -2992, -9312, -5824, -13328, -9360, 0, + 1824, 336, 3792, -1568, 3520, -5712, 1008, -12096, -3744, -20720, -10736, -31584, -19968, -44688, -31440, + 0, 3040, 464, 6288, -2848, 5760, -9936, 1456, -20800, -6624, -35440, -18480, -53856, -34112, -76048, + -53520, 0, 4256, 592, 8784, -4128, 8000, -14160, 1904, -29504, -9504, -50160, -26224, -76128, -48256, + -107408, -75600, 0, 5472, 720, 11280, -5408, 10240, -18384, 2352, -38208, -12384, -64880, -33968, -98400, + -62400, -138768, -97680, 0, 6688, 848, 13776, -6688, 12480, -22608, 2800, -46912, -15264, -79600, -41712, + -120672, -76544, -170128, -119760, 0, 7904, 976, 16272, -7968, 14720, -26832, 3248, -55616, -18144, + -94320, -49456, -142944, -90688, -201488, -141840, 0, 9120, 1104, 18768, -9248, 16960, -31056, 3696, + -64320, -21024, -109040, -57200, -165216, -104832, -232848, -163920, 0, 10336, 1232, 21264, -10528, 19200, + -35280, 4144, -73024, -23904, -123760, -64944, -187488, -118976, -264208, -186000, 0, 11552, 1360, 23760, + -11808, 21440, -39504, 4592, -81728, -26784, -138480, -72688, -209760, -133120, -295568, -208080, 0, + 12768, 1488, 26256, -13088, 23680, -43728, 5040, -90432, -29664, -153200, -80432, -232032, -147264, + -326928, -230160, 0, 13984, 1616, 28752, -14368, 25920, -47952, 5488, -99136, -32544, -167920, -88176, + -254304, -161408, -358288, -252240, 0, 15200, 1744, 31248, -15648, 28160, -52176, 5936, -107840, -35424, + -182640, -95920, -276576, -175552, -389648, -274320, 0, 16416, 1872, 33744, -16928, 30400, -56400, 6384, + -116544, -38304, -197360, -103664, -298848, -189696, -421008, -296400, 0, 17632, 2000, 36240, -18208, + 32640, -60624, 6832, -125248, -41184, -212080, -111408, -321120, -203840, -452368, -318480, 0, 18848, + 2128, 38736, -19488, 34880, -64848, 7280, -133952, -44064, -226800, -119152, -343392, -217984, -483728, + -340560 + ] + } + ] + } + ] + }, + { + "name": "MatMulNBits; K=16, N=32, block_size=16, bits=4", + "operator": "MatMulNBits", + "opset": { "domain": "com.microsoft", "version": 1 }, + "attributes": [ + { "name": "K", "data": 16, "type": "int" }, + { "name": "N", "data": 32, "type": "int" }, + { "name": "block_size", "data": 16, "type": "int" }, + { "name": "bits", "data": 4, "type": "int" } + ], + "cases": [ + { + "name": "MatMulNBits; K=16, N=32, block_size=16, bits=4; symmetric", + "inputs": [ + { + "data": [ + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, + 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, + 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, + 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, + 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, + 128, 129, 
130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, + 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, + 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, + 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, + 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, + 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, + 254, 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, + 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, + 296, 297, 298, 299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314, 315, 316, + 317, 318, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 337, + 338, 339, 340, 341, 342, 343, 344, 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, 358, + 359, 360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376, 377, 378, 379, + 380, 381, 382, 383, 384, 385, 386, 387, 388, 389, 390, 391, 392, 393, 394, 395, 396, 397, 398, 399, 400, + 401, 402, 403, 404, 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415, 416, 417, 418, 419, 420, 421, + 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434, 435, 436, 437, 438, 439, 440, 441, 442, + 443, 444, 445, 446, 447, 448, 449, 450, 451, 452, 453, 454, 455, 456, 457, 458, 459, 460, 461, 462, 463, + 464, 465, 466, 467, 468, 469, 470, 471, 472, 473, 474, 475, 476, 477, 478, 479, 480, 481, 482, 483, 484, + 485, 486, 487, 488, 489, 490, 491, 492, 493, 494, 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, + 506, 507, 508, 509, 510, 511, 512 + ], + "dims": [32, 16], + "type": "float32" + }, + { + "dims": [32, 1, 8], + "type": "uint8", + "data": [ + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, + 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, + 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, + 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, + 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, + 128, 29, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, + 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, + 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, + 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, + 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, + 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, + 254, 255, 256 + ] + }, + { + "dims": [32], + "type": "float32", + "data": [ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, + 29, 30, 31 + ] + } + ], + "outputs": [ + { + "dims": [32, 32], + "type": "float32", + "data": [ + 0, -428, -1288, -1068, -2288, 
-1420, -3000, -1484, -3424, -1260, -3560, -748, -3408, 52, -2968, 1140, + -2272, 2516, -1224, 4180, 80, 6132, 1672, 8372, 3552, 10900, 5720, 13716, 8176, 16820, 10920, 12276, 0, + -1116, -3976, -2748, -7152, -3580, -9528, -3612, -11104, -2844, -11880, -1276, -11856, 1092, -11032, 4260, + -8160, 8228, -6984, 12996, -3760, 18564, 264, 24932, 5088, 32100, 10712, 40068, 17136, 48836, 24360, + 42532, 0, -1804, -6664, -4428, -12016, -5740, -16056, -5740, -18784, -4428, -20200, -1804, -20304, 2132, + -19096, 7380, -14048, 13940, -12744, 21812, -7600, 30996, -1144, 41492, 6624, 53300, 15704, 66420, 26096, + 80852, 37800, 72788, 0, -2492, -9352, -6108, -16880, -7900, -22584, -7868, -26464, -6012, -28520, -2332, + -28752, 3172, -27160, 10500, -19936, 19652, -18504, 30628, -11440, 43428, -2552, 58052, 8160, 74500, + 20696, 92772, 35056, 112868, 51240, 103044, 0, -3180, -12040, -7788, -21744, -10060, -29112, -9996, + -34144, -7596, -36840, -2860, -37200, 4212, -35224, 13620, -25824, 25364, -24264, 39444, -15280, 55860, + -3960, 74612, 9696, 95700, 25688, 119124, 44016, 144884, 64680, 133300, 0, -3868, -14728, -9468, -26608, + -12220, -35640, -12124, -41824, -9180, -45160, -3388, -45648, 5252, -43288, 16740, -31712, 31076, -30024, + 48260, -19120, 68292, -5368, 91172, 11232, 116900, 30680, 145476, 52976, 176900, 78120, 163556, 0, -4556, + -17416, -11148, -31472, -14380, -42168, -14252, -49504, -10764, -53480, -3916, -54096, 6292, -51352, + 19860, -37600, 36788, -35784, 57076, -22960, 80724, -6776, 107732, 12768, 138100, 35672, 171828, 61936, + 208916, 91560, 193812, 0, -5244, -20104, -12828, -36336, -16540, -48696, -16380, -57184, -12348, -61800, + -4444, -62544, 7332, -59416, 22980, -43488, 42500, -41544, 65892, -26800, 93156, -8184, 124292, 14304, + 159300, 40664, 198180, 70896, 240932, 105000, 224068, 0, -5932, -22792, -14508, -41200, -18700, -55224, + -18508, -64864, -13932, -70120, -4972, -70992, 8372, -67480, 26100, -49376, 48212, -47304, 74708, -30640, + 105588, -9592, 140852, 15840, 180500, 45656, 224532, 79856, 272948, 118440, 254324, 0, -6620, -25480, + -16188, -46064, -20860, -61752, -20636, -72544, -15516, -78440, -5500, -79440, 9412, -75544, 29220, + -55264, 53924, -53064, 83524, -34480, 118020, -11000, 157412, 17376, 201700, 50648, 250884, 88816, 304964, + 131880, 284580, 0, -7308, -28168, -17868, -50928, -23020, -68280, -22764, -80224, -17100, -86760, -6028, + -87888, 10452, -83608, 32340, -61152, 59636, -58824, 92340, -38320, 130452, -12408, 173972, 18912, 222900, + 55640, 277236, 97776, 336980, 145320, 314836, 0, -7996, -30856, -19548, -55792, -25180, -74808, -24892, + -87904, -18684, -95080, -6556, -96336, 11492, -91672, 35460, -67040, 65348, -64584, 101156, -42160, + 142884, -13816, 190532, 20448, 244100, 60632, 303588, 106736, 368996, 158760, 345092, 0, -8684, -33544, + -21228, -60656, -27340, -81336, -27020, -95584, -20268, -103400, -7084, -104784, 12532, -99736, 38580, + -72928, 71060, -70344, 109972, -46000, 155316, -15224, 207092, 21984, 265300, 65624, 329940, 115696, + 401012, 172200, 375348, 0, -9372, -36232, -22908, -65520, -29500, -87864, -29148, -103264, -21852, + -111720, -7612, -113232, 13572, -107800, 41700, -78816, 76772, -76104, 118788, -49840, 167748, -16632, + 223652, 23520, 286500, 70616, 356292, 124656, 433028, 185640, 405604, 0, -10060, -38920, -24588, -70384, + -31660, -94392, -31276, -110944, -23436, -120040, -8140, -121680, 14612, -115864, 44820, -84704, 82484, + -81864, 127604, -53680, 180180, -18040, 240212, 25056, 307700, 75608, 382644, 133616, 465044, 
199080, + 435860, 0, -10748, -41608, -26268, -75248, -33820, -100920, -33404, -118624, -25020, -128360, -8668, + -130128, 15652, -123928, 47940, -90592, 88196, -87624, 136420, -57520, 192612, -19448, 256772, 26592, + 328900, 80600, 408996, 142576, 497060, 212520, 466116, 0, -11436, -44296, -27948, -80112, -35980, -107448, + -35532, -126304, -26604, -136680, -9196, -138576, 16692, -131992, 51060, -96480, 93908, -93384, 145236, + -61360, 205044, -20856, 273332, 28128, 350100, 85592, 435348, 151536, 529076, 225960, 496372, 0, -12124, + -46984, -29628, -84976, -38140, -113976, -37660, -133984, -28188, -145000, -9724, -147024, 17732, -140056, + 54180, -102368, 99620, -99144, 154052, -65200, 217476, -22264, 289892, 29664, 371300, 90584, 461700, + 160496, 561092, 239400, 526628, 0, -12812, -49672, -31308, -89840, -40300, -120504, -39788, -141664, + -29772, -153320, -10252, -155472, 18772, -148120, 57300, -108256, 105332, -104904, 162868, -69040, 229908, + -23672, 306452, 31200, 392500, 95576, 488052, 169456, 593108, 252840, 556884, 0, -13500, -52360, -32988, + -94704, -42460, -127032, -41916, -149344, -31356, -161640, -10780, -163920, 19812, -156184, 60420, + -114144, 111044, -110664, 171684, -72880, 242340, -25080, 323012, 32736, 413700, 100568, 514404, 178416, + 625124, 266280, 587140, 0, -14188, -55048, -34668, -99568, -44620, -133560, -44044, -157024, -32940, + -169960, -11308, -172368, 20852, -164248, 63540, -120032, 116756, -116424, 180500, -76720, 254772, -26488, + 339572, 34272, 434900, 105560, 540756, 187376, 657140, 279720, 617396, 0, -14876, -57736, -36348, -104432, + -46780, -140088, -46172, -164704, -34524, -178280, -11836, -180816, 21892, -172312, 66660, -125920, + 122468, -122184, 189316, -80560, 267204, -27896, 356132, 35808, 456100, 110552, 567108, 196336, 689156, + 293160, 647652, 0, -15564, -60424, -38028, -109296, -48940, -146616, -48300, -172384, -36108, -186600, + -12364, -189264, 22932, -180376, 69780, -131808, 128180, -127944, 198132, -84400, 279636, -29304, 372692, + 37344, 477300, 115544, 593460, 205296, 721172, 306600, 677908, 0, -16252, -63112, -39708, -114160, -51100, + -153144, -50428, -180064, -37692, -194920, -12892, -197712, 23972, -188440, 72900, -137696, 133892, + -133704, 206948, -88240, 292068, -30712, 389252, 38880, 498500, 120536, 619812, 214256, 753188, 320040, + 708164, 0, -16940, -65800, -41388, -119024, -53260, -159672, -52556, -187744, -39276, -203240, -13420, + -206160, 25012, -196504, 76020, -143584, 139604, -139464, 215764, -92080, 304500, -32120, 405812, 40416, + 519700, 125528, 646164, 223216, 785204, 333480, 738420, 0, -17628, -68488, -43068, -123888, -55420, + -166200, -54684, -195424, -40860, -211560, -13948, -214608, 26052, -204568, 79140, -149472, 145316, + -145224, 224580, -95920, 316932, -33528, 422372, 41952, 540900, 130520, 672516, 232176, 817220, 346920, + 768676, 0, -18316, -71176, -44748, -128752, -57580, -172728, -56812, -203104, -42444, -219880, -14476, + -223056, 27092, -212632, 82260, -155360, 151028, -150984, 233396, -99760, 329364, -34936, 438932, 43488, + 562100, 135512, 698868, 241136, 849236, 360360, 798932, 0, -19004, -73864, -46428, -133616, -59740, + -179256, -58940, -210784, -44028, -228200, -15004, -231504, 28132, -220696, 85380, -161248, 156740, + -156744, 242212, -103600, 341796, -36344, 455492, 45024, 583300, 140504, 725220, 250096, 881252, 373800, + 829188, 0, -19692, -76552, -48108, -138480, -61900, -185784, -61068, -218464, -45612, -236520, -15532, + -239952, 29172, -228760, 88500, -167136, 162452, -162504, 251028, 
-107440, 354228, -37752, 472052, 46560, + 604500, 145496, 751572, 259056, 913268, 387240, 859444, 0, -20380, -79240, -49788, -143344, -64060, + -192312, -63196, -226144, -47196, -244840, -16060, -248400, 30212, -236824, 91620, -173024, 168164, + -168264, 259844, -111280, 366660, -39160, 488612, 48096, 625700, 150488, 777924, 268016, 945284, 400680, + 889700, 0, -21068, -81928, -51468, -148208, -66220, -198840, -65324, -233824, -48780, -253160, -16588, + -256848, 31252, -244888, 94740, -178912, 173876, -174024, 268660, -115120, 379092, -40568, 505172, 49632, + 646900, 155480, 804276, 276976, 977300, 414120, 919956, 0, -21756, -84616, -53148, -153072, -68380, + -205368, -67452, -241504, -50364, -261480, -17116, -265296, 32292, -252952, 97860, -184800, 179588, + -179784, 277476, -118960, 391524, -41976, 521732, 51168, 668100, 160472, 830628, 285936, 1009316, 427560, + 950212 + ] + } + ] + } + ] + }, + { + "name": "MatMulNBits; K=16, N=32, block_size=16, bits=4", + "operator": "MatMulNBits", + "opset": { "domain": "com.microsoft", "version": 1 }, + "attributes": [ + { "name": "K", "data": 16, "type": "int" }, + { "name": "N", "data": 32, "type": "int" }, + { "name": "block_size", "data": 16, "type": "int" }, + { "name": "bits", "data": 4, "type": "int" } + ], + "cases": [ + { + "name": "MatMulNBits; K=16, N=32, block_size=16, bits=4; asymmetric", + "inputs": [ + { + "data": [ + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, + 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, + 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, + 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, + 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, + 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, + 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, + 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, + 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, + 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, + 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, + 254, 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, + 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, + 296, 297, 298, 299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314, 315, 316, + 317, 318, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 337, + 338, 339, 340, 341, 342, 343, 344, 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, 358, + 359, 360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376, 377, 378, 379, + 380, 381, 382, 383, 384, 385, 386, 387, 388, 389, 390, 391, 392, 393, 394, 395, 396, 397, 398, 399, 400, + 401, 402, 403, 404, 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415, 416, 417, 418, 419, 420, 421, + 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434, 435, 436, 437, 438, 439, 440, 441, 442, + 443, 444, 445, 
446, 447, 448, 449, 450, 451, 452, 453, 454, 455, 456, 457, 458, 459, 460, 461, 462, 463, + 464, 465, 466, 467, 468, 469, 470, 471, 472, 473, 474, 475, 476, 477, 478, 479, 480, 481, 482, 483, 484, + 485, 486, 487, 488, 489, 490, 491, 492, 493, 494, 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, + 506, 507, 508, 509, 510, 511, 512 + ], + "dims": [32, 16], + "type": "float32" + }, + { + "dims": [32, 1, 8], + "type": "uint8", + "data": [ + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, + 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, + 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, + 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, + 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, + 128, 29, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, + 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, + 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, + 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, + 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, + 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, + 254, 255, 256 + ] + }, + { + "dims": [32], + "type": "float32", + "data": [ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, + 29, 30, 31 + ] + }, + { + "dims": [32], + "type": "uint8", + "data": [ + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 + ] + } + ], + "outputs": [ + { + "dims": [32, 32], + "type": "float32", + "data": [ + 0, 660, 888, 2196, 2064, 4020, 3528, 6132, 5280, 8532, 7320, 11220, 9648, 14196, 12264, 17460, 15136, + 21012, 18360, 24852, 21840, 28980, 25608, 33396, 29664, 38100, 34008, 43092, 38640, 48372, 43560, 46004, + 0, 2020, 2296, 6660, 5392, 12100, 9288, 18340, 13984, 25380, 19480, 33220, 25776, 41860, 32872, 51300, + 42016, 61540, 49464, 72580, 58960, 84420, 69256, 97060, 80352, 110500, 92248, 124740, 104944, 139780, + 118440, 139748, 0, 3380, 3704, 11124, 8720, 20180, 15048, 30548, 22688, 42228, 31640, 55220, 41904, 69524, + 53480, 85140, 68896, 102068, 80568, 120308, 96080, 139860, 112904, 160724, 131040, 182900, 150488, 206388, + 171248, 231188, 193320, 233492, 0, 4740, 5112, 15588, 12048, 28260, 20808, 42756, 31392, 59076, 43800, + 77220, 58032, 97188, 74088, 118980, 95776, 142596, 111672, 168036, 133200, 195300, 156552, 224388, 181728, + 255300, 208728, 288036, 237552, 322596, 268200, 327236, 0, 6100, 6520, 20052, 15376, 36340, 26568, 54964, + 40096, 75924, 55960, 99220, 74160, 124852, 94696, 152820, 122656, 183124, 142776, 215764, 170320, 250740, + 200200, 288052, 232416, 327700, 266968, 369684, 303856, 414004, 343080, 420980, 0, 7460, 7928, 24516, + 18704, 44420, 32328, 67172, 48800, 92772, 68120, 121220, 90288, 152516, 115304, 186660, 149536, 223652, + 173880, 263492, 207440, 306180, 243848, 351716, 283104, 400100, 325208, 451332, 370160, 505412, 417960, + 514724, 0, 8820, 9336, 28980, 22032, 
52500, 38088, 79380, 57504, 109620, 80280, 143220, 106416, 180180, + 135912, 220500, 176416, 264180, 204984, 311220, 244560, 361620, 287496, 415380, 333792, 472500, 383448, + 532980, 436464, 596820, 492840, 608468, 0, 10180, 10744, 33444, 25360, 60580, 43848, 91588, 66208, 126468, + 92440, 165220, 122544, 207844, 156520, 254340, 203296, 304708, 236088, 358948, 281680, 417060, 331144, + 479044, 384480, 544900, 441688, 614628, 502768, 688228, 567720, 702212, 0, 11540, 12152, 37908, 28688, + 68660, 49608, 103796, 74912, 143316, 104600, 187220, 138672, 235508, 177128, 288180, 230176, 345236, + 267192, 406676, 318800, 472500, 374792, 542708, 435168, 617300, 499928, 696276, 569072, 779636, 642600, + 795956, 0, 12900, 13560, 42372, 32016, 76740, 55368, 116004, 83616, 160164, 116760, 209220, 154800, + 263172, 197736, 322020, 257056, 385764, 298296, 454404, 355920, 527940, 418440, 606372, 485856, 689700, + 558168, 777924, 635376, 871044, 717480, 889700, 0, 14260, 14968, 46836, 35344, 84820, 61128, 128212, + 92320, 177012, 128920, 231220, 170928, 290836, 218344, 355860, 283936, 426292, 329400, 502132, 393040, + 583380, 462088, 670036, 536544, 762100, 616408, 859572, 701680, 962452, 792360, 983444, 0, 15620, 16376, + 51300, 38672, 92900, 66888, 140420, 101024, 193860, 141080, 253220, 187056, 318500, 238952, 389700, + 310816, 466820, 360504, 549860, 430160, 638820, 505736, 733700, 587232, 834500, 674648, 941220, 767984, + 1053860, 867240, 1077188, 0, 16980, 17784, 55764, 42000, 100980, 72648, 152628, 109728, 210708, 153240, + 275220, 203184, 346164, 259560, 423540, 337696, 507348, 391608, 597588, 467280, 694260, 549384, 797364, + 637920, 906900, 732888, 1022868, 834288, 1145268, 942120, 1170932, 0, 18340, 19192, 60228, 45328, 109060, + 78408, 164836, 118432, 227556, 165400, 297220, 219312, 373828, 280168, 457380, 364576, 547876, 422712, + 645316, 504400, 749700, 593032, 861028, 688608, 979300, 791128, 1104516, 900592, 1236676, 1017000, + 1264676, 0, 19700, 20600, 64692, 48656, 117140, 84168, 177044, 127136, 244404, 177560, 319220, 235440, + 401492, 300776, 491220, 391456, 588404, 453816, 693044, 541520, 805140, 636680, 924692, 739296, 1051700, + 849368, 1186164, 966896, 1328084, 1091880, 1358420, 0, 21060, 22008, 69156, 51984, 125220, 89928, 189252, + 135840, 261252, 189720, 341220, 251568, 429156, 321384, 525060, 418336, 628932, 484920, 740772, 578640, + 860580, 680328, 988356, 789984, 1124100, 907608, 1267812, 1033200, 1419492, 1166760, 1452164, 0, 22420, + 23416, 73620, 55312, 133300, 95688, 201460, 144544, 278100, 201880, 363220, 267696, 456820, 341992, + 558900, 445216, 669460, 516024, 788500, 615760, 916020, 723976, 1052020, 840672, 1196500, 965848, 1349460, + 1099504, 1510900, 1241640, 1545908, 0, 23780, 24824, 78084, 58640, 141380, 101448, 213668, 153248, 294948, + 214040, 385220, 283824, 484484, 362600, 592740, 472096, 709988, 547128, 836228, 652880, 971460, 767624, + 1115684, 891360, 1268900, 1024088, 1431108, 1165808, 1602308, 1316520, 1639652, 0, 25140, 26232, 82548, + 61968, 149460, 107208, 225876, 161952, 311796, 226200, 407220, 299952, 512148, 383208, 626580, 498976, + 750516, 578232, 883956, 690000, 1026900, 811272, 1179348, 942048, 1341300, 1082328, 1512756, 1232112, + 1693716, 1391400, 1733396, 0, 26500, 27640, 87012, 65296, 157540, 112968, 238084, 170656, 328644, 238360, + 429220, 316080, 539812, 403816, 660420, 525856, 791044, 609336, 931684, 727120, 1082340, 854920, 1243012, + 992736, 1413700, 1140568, 1594404, 1298416, 1785124, 1466280, 1827140, 0, 27860, 29048, 91476, 68624, + 
165620, 118728, 250292, 179360, 345492, 250520, 451220, 332208, 567476, 424424, 694260, 552736, 831572, + 640440, 979412, 764240, 1137780, 898568, 1306676, 1043424, 1486100, 1198808, 1676052, 1364720, 1876532, + 1541160, 1920884, 0, 29220, 30456, 95940, 71952, 173700, 124488, 262500, 188064, 362340, 262680, 473220, + 348336, 595140, 445032, 728100, 579616, 872100, 671544, 1027140, 801360, 1193220, 942216, 1370340, + 1094112, 1558500, 1257048, 1757700, 1431024, 1967940, 1616040, 2014628, 0, 30580, 31864, 100404, 75280, + 181780, 130248, 274708, 196768, 379188, 274840, 495220, 364464, 622804, 465640, 761940, 606496, 912628, + 702648, 1074868, 838480, 1248660, 985864, 1434004, 1144800, 1630900, 1315288, 1839348, 1497328, 2059348, + 1690920, 2108372, 0, 31940, 33272, 104868, 78608, 189860, 136008, 286916, 205472, 396036, 287000, 517220, + 380592, 650468, 486248, 795780, 633376, 953156, 733752, 1122596, 875600, 1304100, 1029512, 1497668, + 1195488, 1703300, 1373528, 1920996, 1563632, 2150756, 1765800, 2202116, 0, 33300, 34680, 109332, 81936, + 197940, 141768, 299124, 214176, 412884, 299160, 539220, 396720, 678132, 506856, 829620, 660256, 993684, + 764856, 1170324, 912720, 1359540, 1073160, 1561332, 1246176, 1775700, 1431768, 2002644, 1629936, 2242164, + 1840680, 2295860, 0, 34660, 36088, 113796, 85264, 206020, 147528, 311332, 222880, 429732, 311320, 561220, + 412848, 705796, 527464, 863460, 687136, 1034212, 795960, 1218052, 949840, 1414980, 1116808, 1624996, + 1296864, 1848100, 1490008, 2084292, 1696240, 2333572, 1915560, 2389604, 0, 36020, 37496, 118260, 88592, + 214100, 153288, 323540, 231584, 446580, 323480, 583220, 428976, 733460, 548072, 897300, 714016, 1074740, + 827064, 1265780, 986960, 1470420, 1160456, 1688660, 1347552, 1920500, 1548248, 2165940, 1762544, 2424980, + 1990440, 2483348, 0, 37380, 38904, 122724, 91920, 222180, 159048, 335748, 240288, 463428, 335640, 605220, + 445104, 761124, 568680, 931140, 740896, 1115268, 858168, 1313508, 1024080, 1525860, 1204104, 1752324, + 1398240, 1992900, 1606488, 2247588, 1828848, 2516388, 2065320, 2577092, 0, 38740, 40312, 127188, 95248, + 230260, 164808, 347956, 248992, 480276, 347800, 627220, 461232, 788788, 589288, 964980, 767776, 1155796, + 889272, 1361236, 1061200, 1581300, 1247752, 1815988, 1448928, 2065300, 1664728, 2329236, 1895152, 2607796, + 2140200, 2670836, 0, 40100, 41720, 131652, 98576, 238340, 170568, 360164, 257696, 497124, 359960, 649220, + 477360, 816452, 609896, 998820, 794656, 1196324, 920376, 1408964, 1098320, 1636740, 1291400, 1879652, + 1499616, 2137700, 1722968, 2410884, 1961456, 2699204, 2215080, 2764580, 0, 41460, 43128, 136116, 101904, + 246420, 176328, 372372, 266400, 513972, 372120, 671220, 493488, 844116, 630504, 1032660, 821536, 1236852, + 951480, 1456692, 1135440, 1692180, 1335048, 1943316, 1550304, 2210100, 1781208, 2492532, 2027760, 2790612, + 2289960, 2858324, 0, 42820, 44536, 140580, 105232, 254500, 182088, 384580, 275104, 530820, 384280, 693220, + 509616, 871780, 651112, 1066500, 848416, 1277380, 982584, 1504420, 1172560, 1747620, 1378696, 2006980, + 1600992, 2282500, 1839448, 2574180, 2094064, 2882020, 2364840, 2952068 + ] + } + ] + } + ] + }, + { + "name": "MatMulNBits; K=32, N=16, block_size=16, bits=4", + "operator": "MatMulNBits", + "opset": { "domain": "com.microsoft", "version": 1 }, + "attributes": [ + { "name": "K", "data": 32, "type": "int" }, + { "name": "N", "data": 16, "type": "int" }, + { "name": "block_size", "data": 16, "type": "int" }, + { "name": "bits", "data": 4, "type": "int" } + ], + 
"cases": [ + { + "name": "MatMulNBits; K=32, N=16, block_size=16, bits=4; symmetric", + "inputs": [ + { + "data": [ + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, + 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, + 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, + 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, + 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, + 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, + 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, + 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, + 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, + 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, + 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, + 254, 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, + 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, + 296, 297, 298, 299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314, 315, 316, + 317, 318, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 337, + 338, 339, 340, 341, 342, 343, 344, 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, 358, + 359, 360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376, 377, 378, 379, + 380, 381, 382, 383, 384, 385, 386, 387, 388, 389, 390, 391, 392, 393, 394, 395, 396, 397, 398, 399, 400, + 401, 402, 403, 404, 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415, 416, 417, 418, 419, 420, 421, + 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434, 435, 436, 437, 438, 439, 440, 441, 442, + 443, 444, 445, 446, 447, 448, 449, 450, 451, 452, 453, 454, 455, 456, 457, 458, 459, 460, 461, 462, 463, + 464, 465, 466, 467, 468, 469, 470, 471, 472, 473, 474, 475, 476, 477, 478, 479, 480, 481, 482, 483, 484, + 485, 486, 487, 488, 489, 490, 491, 492, 493, 494, 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, + 506, 507, 508, 509, 510, 511, 512 + ], + "dims": [16, 32], + "type": "float32" + }, + { + "dims": [16, 2, 8], + "type": "uint8", + "data": [ + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, + 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, + 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, + 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, + 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, + 128, 29, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, + 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, + 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 
186, 187, 188, 189, 190, + 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, + 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, + 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, + 254, 255, 256 + ] + }, + { + "dims": [32], + "type": "float32", + "data": [ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, + 29, 30, 31 + ] + } + ], + "outputs": [ + { + "dims": [16, 16], + "type": "float32", + "data": [ + -1116, -4036, -5868, -6612, -6268, -4836, -2316, 1292, 5956, 11772, 18644, 26604, 35652, 45788, 57012, + 53452, -2492, -12772, -19916, -23924, -24796, -22532, -17132, -8596, 5604, 17884, 35828, 56908, 81124, + 108476, 138964, 140844, -3868, -21508, -33964, -41236, -43324, -40228, -31948, -18484, 5252, 23996, 53012, + 87212, 126596, 171164, 220916, 228236, -5244, -30244, -48012, -58548, -61852, -57924, -46764, -28372, + 4900, 30108, 70196, 117516, 172068, 233852, 302868, 315628, -6620, -38980, -62060, -75860, -80380, -75620, + -61580, -38260, 4548, 36220, 87380, 147820, 217540, 296540, 384820, 403020, -7996, -47716, -76108, -93172, + -98908, -93316, -76396, -48148, 4196, 42332, 104564, 178124, 263012, 359228, 466772, 490412, -9372, + -56452, -90156, -110484, -117436, -111012, -91212, -58036, 3844, 48444, 121748, 208428, 308484, 421916, + 548724, 577804, -10748, -65188, -104204, -127796, -135964, -128708, -106028, -67924, 3492, 54556, 138932, + 238732, 353956, 484604, 630676, 665196, -12124, -73924, -118252, -145108, -154492, -146404, -120844, + -77812, 3140, 60668, 156116, 269036, 399428, 547292, 712628, 752588, -13500, -82660, -132300, -162420, + -173020, -164100, -135660, -87700, 2788, 66780, 173300, 299340, 444900, 609980, 794580, 839980, -14876, + -91396, -146348, -179732, -191548, -181796, -150476, -97588, 2436, 72892, 190484, 329644, 490372, 672668, + 876532, 927372, -16252, -100132, -160396, -197044, -210076, -199492, -165292, -107476, 2084, 79004, + 207668, 359948, 535844, 735356, 958484, 1014764, -17628, -108868, -174444, -214356, -228604, -217188, + -180108, -117364, 1732, 85116, 224852, 390252, 581316, 798044, 1040436, 1102156, -19004, -117604, -188492, + -231668, -247132, -234884, -194924, -127252, 1380, 91228, 242036, 420556, 626788, 860732, 1122388, + 1189548, -20380, -126340, -202540, -248980, -265660, -252580, -209740, -137140, 1028, 97340, 259220, + 450860, 672260, 923420, 1204340, 1276940, -21756, -135076, -216588, -266292, -284188, -270276, -224556, + -147028, 676, 103452, 276404, 481164, 717732, 986108, 1286292, 1364332 + ] + } + ] + } + ] + }, + { + "name": "MatMulNBits; K=32, N=16, block_size=16, bits=4", + "operator": "MatMulNBits", + "opset": { "domain": "com.microsoft", "version": 1 }, + "attributes": [ + { "name": "K", "data": 32, "type": "int" }, + { "name": "N", "data": 16, "type": "int" }, + { "name": "block_size", "data": 16, "type": "int" }, + { "name": "bits", "data": 4, "type": "int" } + ], + "cases": [ + { + "name": "MatMulNBits; K=32, N=16, block_size=16, bits=4; asymmetric", + "inputs": [ + { + "data": [ + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, + 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, + 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, + 82, 83, 84, 
85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, + 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, + 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, + 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, + 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, + 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, + 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, + 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, + 254, 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, + 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, + 296, 297, 298, 299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314, 315, 316, + 317, 318, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 337, + 338, 339, 340, 341, 342, 343, 344, 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, 358, + 359, 360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376, 377, 378, 379, + 380, 381, 382, 383, 384, 385, 386, 387, 388, 389, 390, 391, 392, 393, 394, 395, 396, 397, 398, 399, 400, + 401, 402, 403, 404, 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415, 416, 417, 418, 419, 420, 421, + 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434, 435, 436, 437, 438, 439, 440, 441, 442, + 443, 444, 445, 446, 447, 448, 449, 450, 451, 452, 453, 454, 455, 456, 457, 458, 459, 460, 461, 462, 463, + 464, 465, 466, 467, 468, 469, 470, 471, 472, 473, 474, 475, 476, 477, 478, 479, 480, 481, 482, 483, 484, + 485, 486, 487, 488, 489, 490, 491, 492, 493, 494, 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, + 506, 507, 508, 509, 510, 511, 512 + ], + "dims": [16, 32], + "type": "float32" + }, + { + "dims": [16, 2, 8], + "type": "uint8", + "data": [ + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, + 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, + 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, + 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, + 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, + 128, 29, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, + 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, + 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, + 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, + 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, + 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, + 254, 255, 256 + ] + }, + { + "dims": [32], + "type": "float32", + "data": [ + 0, 1, 2, 3, 4, 5, 6, 7, 
8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, + 29, 30, 31 + ] + }, + { + "dims": [16], + "type": "uint8", + "data": [128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128] + } + ], + "outputs": [ + { + "dims": [16, 16], + "type": "float32", + "data": [ + -1116, -1860, -1516, -84, 2436, 6044, 10740, 16524, 23364, 31356, 40404, 50540, 61764, 74076, 87476, + 86092, -2492, -2404, 820, 7180, 16676, 29308, 45076, 63980, 88548, 111196, 139508, 170956, 205540, 243260, + 284116, 296364, -3868, -2948, 3156, 14444, 30916, 52572, 79412, 111436, 153732, 191036, 238612, 291372, + 349316, 412444, 480756, 506636, -5244, -3492, 5492, 21708, 45156, 75836, 113748, 158892, 218916, 270876, + 337716, 411788, 493092, 581628, 677396, 716908, -6620, -4036, 7828, 28972, 59396, 99100, 148084, 206348, + 284100, 350716, 436820, 532204, 636868, 750812, 874036, 927180, -7996, -4580, 10164, 36236, 73636, 122364, + 182420, 253804, 349284, 430556, 535924, 652620, 780644, 919996, 1070676, 1137452, -9372, -5124, 12500, + 43500, 87876, 145628, 216756, 301260, 414468, 510396, 635028, 773036, 924420, 1089180, 1267316, 1347724, + -10748, -5668, 14836, 50764, 102116, 168892, 251092, 348716, 479652, 590236, 734132, 893452, 1068196, + 1258364, 1463956, 1557996, -12124, -6212, 17172, 58028, 116356, 192156, 285428, 396172, 544836, 670076, + 833236, 1013868, 1211972, 1427548, 1660596, 1768268, -13500, -6756, 19508, 65292, 130596, 215420, 319764, + 443628, 610020, 749916, 932340, 1134284, 1355748, 1596732, 1857236, 1978540, -14876, -7300, 21844, 72556, + 144836, 238684, 354100, 491084, 675204, 829756, 1031444, 1254700, 1499524, 1765916, 2053876, 2188812, + -16252, -7844, 24180, 79820, 159076, 261948, 388436, 538540, 740388, 909596, 1130548, 1375116, 1643300, + 1935100, 2250516, 2399084, -17628, -8388, 26516, 87084, 173316, 285212, 422772, 585996, 805572, 989436, + 1229652, 1495532, 1787076, 2104284, 2447156, 2609356, -19004, -8932, 28852, 94348, 187556, 308476, 457108, + 633452, 870756, 1069276, 1328756, 1615948, 1930852, 2273468, 2643796, 2819628, -20380, -9476, 31188, + 101612, 201796, 331740, 491444, 680908, 935940, 1149116, 1427860, 1736364, 2074628, 2442652, 2840436, + 3029900, -21756, -10020, 33524, 108876, 216036, 355004, 525780, 728364, 1001124, 1228956, 1526964, + 1856780, 2218404, 2611836, 3037076, 3240172 + ] + } + ] + } + ] + }, + { + "name": "MatMulNBits; K=32, N=32, block_size=16, bits=4", + "operator": "MatMulNBits", + "opset": { "domain": "com.microsoft", "version": 1 }, + "attributes": [ + { "name": "K", "data": 32, "type": "int" }, + { "name": "N", "data": 32, "type": "int" }, + { "name": "block_size", "data": 16, "type": "int" }, + { "name": "bits", "data": 4, "type": "int" } + ], + "cases": [ + { + "name": "MatMulNBits; K=32, N=32, block_size=16, bits=4; symmetric", + "inputs": [ + { + "data": [ + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, + 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, + 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, + 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, + 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, + 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, + 149, 150, 151, 152, 153, 
154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, + 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, + 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, + 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, + 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, + 254, 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, + 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, + 296, 297, 298, 299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314, 315, 316, + 317, 318, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 337, + 338, 339, 340, 341, 342, 343, 344, 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, 358, + 359, 360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376, 377, 378, 379, + 380, 381, 382, 383, 384, 385, 386, 387, 388, 389, 390, 391, 392, 393, 394, 395, 396, 397, 398, 399, 400, + 401, 402, 403, 404, 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415, 416, 417, 418, 419, 420, 421, + 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434, 435, 436, 437, 438, 439, 440, 441, 442, + 443, 444, 445, 446, 447, 448, 449, 450, 451, 452, 453, 454, 455, 456, 457, 458, 459, 460, 461, 462, 463, + 464, 465, 466, 467, 468, 469, 470, 471, 472, 473, 474, 475, 476, 477, 478, 479, 480, 481, 482, 483, 484, + 485, 486, 487, 488, 489, 490, 491, 492, 493, 494, 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, + 506, 507, 508, 509, 510, 511, 512, 513, 514, 515, 516, 517, 518, 519, 520, 521, 522, 523, 524, 525, 526, + 527, 528, 529, 530, 531, 532, 533, 534, 535, 536, 537, 538, 539, 540, 541, 542, 543, 544, 545, 546, 547, + 548, 549, 550, 551, 552, 553, 554, 555, 556, 557, 558, 559, 560, 561, 562, 563, 564, 565, 566, 567, 568, + 569, 570, 571, 572, 573, 574, 575, 576, 577, 578, 579, 580, 581, 582, 583, 584, 585, 586, 587, 588, 589, + 590, 591, 592, 593, 594, 595, 596, 597, 598, 599, 600, 601, 602, 603, 604, 605, 606, 607, 608, 609, 610, + 611, 612, 613, 614, 615, 616, 617, 618, 619, 620, 621, 622, 623, 624, 625, 626, 627, 628, 629, 630, 631, + 632, 633, 634, 635, 636, 637, 638, 639, 640, 641, 642, 643, 644, 645, 646, 647, 648, 649, 650, 651, 652, + 653, 654, 655, 656, 657, 658, 659, 660, 661, 662, 663, 664, 665, 666, 667, 668, 669, 670, 671, 672, 673, + 674, 675, 676, 677, 678, 679, 680, 681, 682, 683, 684, 685, 686, 687, 688, 689, 690, 691, 692, 693, 694, + 695, 696, 697, 698, 699, 700, 701, 702, 703, 704, 705, 706, 707, 708, 709, 710, 711, 712, 713, 714, 715, + 716, 717, 718, 719, 720, 721, 722, 723, 724, 725, 726, 727, 728, 729, 730, 731, 732, 733, 734, 735, 736, + 737, 738, 739, 740, 741, 742, 743, 744, 745, 746, 747, 748, 749, 750, 751, 752, 753, 754, 755, 756, 757, + 758, 759, 760, 761, 762, 763, 764, 765, 766, 767, 768, 769, 770, 771, 772, 773, 774, 775, 776, 777, 778, + 779, 780, 781, 782, 783, 784, 785, 786, 787, 788, 789, 790, 791, 792, 793, 794, 795, 796, 797, 798, 799, + 800, 801, 802, 803, 804, 805, 806, 807, 808, 809, 810, 811, 812, 813, 814, 815, 816, 817, 818, 819, 820, + 821, 822, 823, 824, 825, 826, 827, 828, 829, 830, 831, 832, 833, 834, 835, 836, 837, 838, 839, 840, 841, + 842, 843, 844, 845, 846, 847, 848, 849, 850, 
851, 852, 853, 854, 855, 856, 857, 858, 859, 860, 861, 862, + 863, 864, 865, 866, 867, 868, 869, 870, 871, 872, 873, 874, 875, 876, 877, 878, 879, 880, 881, 882, 883, + 884, 885, 886, 887, 888, 889, 890, 891, 892, 893, 894, 895, 896, 897, 898, 899, 900, 901, 902, 903, 904, + 905, 906, 907, 908, 909, 910, 911, 912, 913, 914, 915, 916, 917, 918, 919, 920, 921, 922, 923, 924, 925, + 926, 927, 928, 929, 930, 931, 932, 933, 934, 935, 936, 937, 938, 939, 940, 941, 942, 943, 944, 945, 946, + 947, 948, 949, 950, 951, 952, 953, 954, 955, 956, 957, 958, 959, 960, 961, 962, 963, 964, 965, 966, 967, + 968, 969, 970, 971, 972, 973, 974, 975, 976, 977, 978, 979, 980, 981, 982, 983, 984, 985, 986, 987, 988, + 989, 990, 991, 992, 993, 994, 995, 996, 997, 998, 999, 1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007, + 1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015, 1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023, 1024 + ], + "dims": [32, 32], + "type": "float32" + }, + { + "dims": [32, 2, 8], + "type": "uint8", + "data": [ + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, + 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, + 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, + 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, + 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, + 128, 29, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, + 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, + 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, + 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, + 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, + 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, + 254, 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, + 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, + 296, 297, 298, 299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314, 315, 316, + 317, 318, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 337, + 338, 339, 340, 341, 342, 343, 344, 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, 358, + 359, 360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376, 377, 378, 379, + 380, 381, 382, 383, 384, 385, 386, 387, 388, 389, 390, 391, 392, 393, 394, 395, 396, 397, 398, 399, 400, + 401, 402, 403, 404, 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415, 416, 417, 418, 419, 420, 421, + 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434, 435, 436, 437, 438, 439, 440, 441, 442, + 443, 444, 445, 446, 447, 448, 449, 450, 451, 452, 453, 454, 455, 456, 457, 458, 459, 460, 461, 462, 463, + 464, 465, 466, 467, 468, 469, 470, 471, 472, 473, 474, 475, 476, 477, 478, 479, 480, 481, 482, 483, 484, + 485, 486, 487, 488, 489, 490, 491, 492, 493, 494, 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, + 506, 507, 508, 509, 510, 511, 512 + ] + }, + { + "dims": [64], + "type": 
"float32", + "data": [ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, + 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, + 55, 56, 57, 58, 59, 60, 61, 62, 63 + ] + } + ], + "outputs": [ + { + "dims": [32, 32], + "type": "float32", + "data": [ + -1116, -4036, -5868, -6612, -6268, -4836, -2316, 1292, 5956, 11772, 18644, 26604, 35652, 45788, 57012, + 53452, -59740, -53956, -47084, -39124, -30076, -19940, -8716, 3596, 16996, 31484, 47060, 63724, 81476, + 100316, 120244, 109004, -2492, -12772, -19916, -23924, -24796, -22532, -17132, -8596, 5604, 17884, 35828, + 56908, 81124, 108476, 138964, 140844, -199356, -184548, -166604, -145524, -121308, -93956, -63468, -29844, + 6916, 46812, 89844, 136012, 185316, 237756, 293332, 287532, -3868, -21508, -33964, -41236, -43324, -40228, + -31948, -18484, 5252, 23996, 53012, 87212, 126596, 171164, 220916, 228236, -338972, -315140, -286124, + -251924, -212540, -167972, -118220, -63284, -3164, 62140, 132628, 208300, 289156, 375196, 466420, 466060, + -5244, -30244, -48012, -58548, -61852, -57924, -46764, -28372, 4900, 30108, 70196, 117516, 172068, 233852, + 302868, 315628, -478588, -445732, -405644, -358324, -303772, -241988, -172972, -96724, -13244, 77468, + 175412, 280588, 392996, 512636, 639508, 644588, -6620, -38980, -62060, -75860, -80380, -75620, -61580, + -38260, 4548, 36220, 87380, 147820, 217540, 296540, 384820, 403020, -618204, -576324, -525164, -464724, + -395004, -316004, -227724, -130164, -23324, 92796, 218196, 352876, 496836, 650076, 812596, 823116, -7996, + -47716, -76108, -93172, -98908, -93316, -76396, -48148, 4196, 42332, 104564, 178124, 263012, 359228, + 466772, 490412, -757820, -706916, -644684, -571124, -486236, -390020, -282476, -163604, -33404, 108124, + 260980, 425164, 600676, 787516, 985684, 1001644, -9372, -56452, -90156, -110484, -117436, -111012, -91212, + -58036, 3844, 48444, 121748, 208428, 308484, 421916, 548724, 577804, -897436, -837508, -764204, -677524, + -577468, -464036, -337228, -197044, -43484, 123452, 303764, 497452, 704516, 924956, 1158772, 1180172, + -10748, -65188, -104204, -127796, -135964, -128708, -106028, -67924, 3492, 54556, 138932, 238732, 353956, + 484604, 630676, 665196, -1037052, -968100, -883724, -783924, -668700, -538052, -391980, -230484, -53564, + 138780, 346548, 569740, 808356, 1062396, 1331860, 1358700, -12124, -73924, -118252, -145108, -154492, + -146404, -120844, -77812, 3140, 60668, 156116, 269036, 399428, 547292, 712628, 752588, -1176668, -1098692, + -1003244, -890324, -759932, -612068, -446732, -263924, -63644, 154108, 389332, 642028, 912196, 1199836, + 1504948, 1537228, -13500, -82660, -132300, -162420, -173020, -164100, -135660, -87700, 2788, 66780, + 173300, 299340, 444900, 609980, 794580, 839980, -1316284, -1229284, -1122764, -996724, -851164, -686084, + -501484, -297364, -73724, 169436, 432116, 714316, 1016036, 1337276, 1678036, 1715756, -14876, -91396, + -146348, -179732, -191548, -181796, -150476, -97588, 2436, 72892, 190484, 329644, 490372, 672668, 876532, + 927372, -1455900, -1359876, -1242284, -1103124, -942396, -760100, -556236, -330804, -83804, 184764, + 474900, 786604, 1119876, 1474716, 1851124, 1894284, -16252, -100132, -160396, -197044, -210076, -199492, + -165292, -107476, 2084, 79004, 207668, 359948, 535844, 735356, 958484, 1014764, -1595516, -1490468, + -1361804, -1209524, -1033628, -834116, -610988, -364244, -93884, 200092, 517684, 858892, 1223716, 1612156, + 2024212, 
2072812, -17628, -108868, -174444, -214356, -228604, -217188, -180108, -117364, 1732, 85116, + 224852, 390252, 581316, 798044, 1040436, 1102156, -1735132, -1621060, -1481324, -1315924, -1124860, + -908132, -665740, -397684, -103964, 215420, 560468, 931180, 1327556, 1749596, 2197300, 2251340, -19004, + -117604, -188492, -231668, -247132, -234884, -194924, -127252, 1380, 91228, 242036, 420556, 626788, + 860732, 1122388, 1189548, -1874748, -1751652, -1600844, -1422324, -1216092, -982148, -720492, -431124, + -114044, 230748, 603252, 1003468, 1431396, 1887036, 2370388, 2429868, -20380, -126340, -202540, -248980, + -265660, -252580, -209740, -137140, 1028, 97340, 259220, 450860, 672260, 923420, 1204340, 1276940, + -2014364, -1882244, -1720364, -1528724, -1307324, -1056164, -775244, -464564, -124124, 246076, 646036, + 1075756, 1535236, 2024476, 2543476, 2608396, -21756, -135076, -216588, -266292, -284188, -270276, -224556, + -147028, 676, 103452, 276404, 481164, 717732, 986108, 1286292, 1364332, -2153980, -2012836, -1839884, + -1635124, -1398556, -1130180, -829996, -498004, -134204, 261404, 688820, 1148044, 1639076, 2161916, + 2716564, 2786924, -23132, -143812, -230636, -283604, -302716, -287972, -239372, -156916, 324, 109564, + 293588, 511468, 763204, 1048796, 1368244, 1451724, -2293596, -2143428, -1959404, -1741524, -1489788, + -1204196, -884748, -531444, -144284, 276732, 731604, 1220332, 1742916, 2299356, 2889652, 2965452, -24508, + -152548, -244684, -300916, -321244, -305668, -254188, -166804, -28, 115676, 310772, 541772, 808676, + 1111484, 1450196, 1539116, -2433212, -2274020, -2078924, -1847924, -1581020, -1278212, -939500, -564884, + -154364, 292060, 774388, 1292620, 1846756, 2436796, 3062740, 3143980, -25884, -161284, -258732, -318228, + -339772, -323364, -269004, -176692, -380, 121788, 327956, 572076, 854148, 1174172, 1532148, 1626508, + -2572828, -2404612, -2198444, -1954324, -1672252, -1352228, -994252, -598324, -164444, 307388, 817172, + 1364908, 1950596, 2574236, 3235828, 3322508, -27260, -170020, -272780, -335540, -358300, -341060, -283820, + -186580, -732, 127900, 345140, 602380, 899620, 1236860, 1614100, 1713900, -2712444, -2535204, -2317964, + -2060724, -1763484, -1426244, -1049004, -631764, -174524, 322716, 859956, 1437196, 2054436, 2711676, + 3408916, 3501036, -28636, -178756, -286828, -352852, -376828, -358756, -298636, -196468, -1084, 134012, + 362324, 632684, 945092, 1299548, 1696052, 1801292, -2852060, -2665796, -2437484, -2167124, -1854716, + -1500260, -1103756, -665204, -184604, 338044, 902740, 1509484, 2158276, 2849116, 3582004, 3679564, -30012, + -187492, -300876, -370164, -395356, -376452, -313452, -206356, -1436, 140124, 379508, 662988, 990564, + 1362236, 1778004, 1888684, -2991676, -2796388, -2557004, -2273524, -1945948, -1574276, -1158508, -698644, + -194684, 353372, 945524, 1581772, 2262116, 2986556, 3755092, 3858092, -31388, -196228, -314924, -387476, + -413884, -394148, -328268, -216244, -1788, 146236, 396692, 693292, 1036036, 1424924, 1859956, 1976076, + -3131292, -2926980, -2676524, -2379924, -2037180, -1648292, -1213260, -732084, -204764, 368700, 988308, + 1654060, 2365956, 3123996, 3928180, 4036620, -32764, -204964, -328972, -404788, -432412, -411844, -343084, + -226132, -2140, 152348, 413876, 723596, 1081508, 1487612, 1941908, 2063468, -3270908, -3057572, -2796044, + -2486324, -2128412, -1722308, -1268012, -765524, -214844, 384028, 1031092, 1726348, 2469796, 3261436, + 4101268, 4215148, -34140, -213700, -343020, -422100, -450940, -429540, -357900, -236020, 
-2492, 158460, + 431060, 753900, 1126980, 1550300, 2023860, 2150860, -3410524, -3188164, -2915564, -2592724, -2219644, + -1796324, -1322764, -798964, -224924, 399356, 1073876, 1798636, 2573636, 3398876, 4274356, 4393676, + -35516, -222436, -357068, -439412, -469468, -447236, -372716, -245908, -2844, 164572, 448244, 784204, + 1172452, 1612988, 2105812, 2238252, -3550140, -3318756, -3035084, -2699124, -2310876, -1870340, -1377516, + -832404, -235004, 414684, 1116660, 1870924, 2677476, 3536316, 4447444, 4572204, -36892, -231172, -371116, + -456724, -487996, -464932, -387532, -255796, -3196, 170684, 465428, 814508, 1217924, 1675676, 2187764, + 2325644, -3689756, -3449348, -3154604, -2805524, -2402108, -1944356, -1432268, -865844, -245084, 430012, + 1159444, 1943212, 2781316, 3673756, 4620532, 4750732, -38268, -239908, -385164, -474036, -506524, -482628, + -402348, -265684, -3548, 176796, 482612, 844812, 1263396, 1738364, 2269716, 2413036, -3829372, -3579940, + -3274124, -2911924, -2493340, -2018372, -1487020, -899284, -255164, 445340, 1202228, 2015500, 2885156, + 3811196, 4793620, 4929260, -39644, -248644, -399212, -491348, -525052, -500324, -417164, -275572, -3900, + 182908, 499796, 875116, 1308868, 1801052, 2351668, 2500428, -3968988, -3710532, -3393644, -3018324, + -2584572, -2092388, -1541772, -932724, -265244, 460668, 1245012, 2087788, 2988996, 3948636, 4966708, + 5107788, -41020, -257380, -413260, -508660, -543580, -518020, -431980, -285460, -4252, 189020, 516980, + 905420, 1354340, 1863740, 2433620, 2587820, -4108604, -3841124, -3513164, -3124724, -2675804, -2166404, + -1596524, -966164, -275324, 475996, 1287796, 2160076, 3092836, 4086076, 5139796, 5286316, -42396, -266116, + -427308, -525972, -562108, -535716, -446796, -295348, -4604, 195132, 534164, 935724, 1399812, 1926428, + 2515572, 2675212, -4248220, -3971716, -3632684, -3231124, -2767036, -2240420, -1651276, -999604, -285404, + 491324, 1330580, 2232364, 3196676, 4223516, 5312884, 5464844, -43772, -274852, -441356, -543284, -580636, + -553412, -461612, -305236, -4956, 201244, 551348, 966028, 1445284, 1989116, 2597524, 2762604, -4387836, + -4102308, -3752204, -3337524, -2858268, -2314436, -1706028, -1033044, -295484, 506652, 1373364, 2304652, + 3300516, 4360956, 5485972, 5643372 + ] + } + ] + } + ] + }, + { + "name": "MatMulNBits; K=32, N=32, block_size=16, bits=4", + "operator": "MatMulNBits", + "opset": { "domain": "com.microsoft", "version": 1 }, + "attributes": [ + { "name": "K", "data": 32, "type": "int" }, + { "name": "N", "data": 32, "type": "int" }, + { "name": "block_size", "data": 16, "type": "int" }, + { "name": "bits", "data": 4, "type": "int" } + ], + "cases": [ + { + "name": "MatMulNBits; K=32, N=32, block_size=16, bits=4; asymmetric", + "inputs": [ + { + "data": [ + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, + 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, + 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, + 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, + 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, + 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, + 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, + 170, 171, 172, 
173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, + 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, + 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, + 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, + 254, 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, + 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, + 296, 297, 298, 299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314, 315, 316, + 317, 318, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 337, + 338, 339, 340, 341, 342, 343, 344, 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, 358, + 359, 360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376, 377, 378, 379, + 380, 381, 382, 383, 384, 385, 386, 387, 388, 389, 390, 391, 392, 393, 394, 395, 396, 397, 398, 399, 400, + 401, 402, 403, 404, 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415, 416, 417, 418, 419, 420, 421, + 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434, 435, 436, 437, 438, 439, 440, 441, 442, + 443, 444, 445, 446, 447, 448, 449, 450, 451, 452, 453, 454, 455, 456, 457, 458, 459, 460, 461, 462, 463, + 464, 465, 466, 467, 468, 469, 470, 471, 472, 473, 474, 475, 476, 477, 478, 479, 480, 481, 482, 483, 484, + 485, 486, 487, 488, 489, 490, 491, 492, 493, 494, 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, + 506, 507, 508, 509, 510, 511, 512, 513, 514, 515, 516, 517, 518, 519, 520, 521, 522, 523, 524, 525, 526, + 527, 528, 529, 530, 531, 532, 533, 534, 535, 536, 537, 538, 539, 540, 541, 542, 543, 544, 545, 546, 547, + 548, 549, 550, 551, 552, 553, 554, 555, 556, 557, 558, 559, 560, 561, 562, 563, 564, 565, 566, 567, 568, + 569, 570, 571, 572, 573, 574, 575, 576, 577, 578, 579, 580, 581, 582, 583, 584, 585, 586, 587, 588, 589, + 590, 591, 592, 593, 594, 595, 596, 597, 598, 599, 600, 601, 602, 603, 604, 605, 606, 607, 608, 609, 610, + 611, 612, 613, 614, 615, 616, 617, 618, 619, 620, 621, 622, 623, 624, 625, 626, 627, 628, 629, 630, 631, + 632, 633, 634, 635, 636, 637, 638, 639, 640, 641, 642, 643, 644, 645, 646, 647, 648, 649, 650, 651, 652, + 653, 654, 655, 656, 657, 658, 659, 660, 661, 662, 663, 664, 665, 666, 667, 668, 669, 670, 671, 672, 673, + 674, 675, 676, 677, 678, 679, 680, 681, 682, 683, 684, 685, 686, 687, 688, 689, 690, 691, 692, 693, 694, + 695, 696, 697, 698, 699, 700, 701, 702, 703, 704, 705, 706, 707, 708, 709, 710, 711, 712, 713, 714, 715, + 716, 717, 718, 719, 720, 721, 722, 723, 724, 725, 726, 727, 728, 729, 730, 731, 732, 733, 734, 735, 736, + 737, 738, 739, 740, 741, 742, 743, 744, 745, 746, 747, 748, 749, 750, 751, 752, 753, 754, 755, 756, 757, + 758, 759, 760, 761, 762, 763, 764, 765, 766, 767, 768, 769, 770, 771, 772, 773, 774, 775, 776, 777, 778, + 779, 780, 781, 782, 783, 784, 785, 786, 787, 788, 789, 790, 791, 792, 793, 794, 795, 796, 797, 798, 799, + 800, 801, 802, 803, 804, 805, 806, 807, 808, 809, 810, 811, 812, 813, 814, 815, 816, 817, 818, 819, 820, + 821, 822, 823, 824, 825, 826, 827, 828, 829, 830, 831, 832, 833, 834, 835, 836, 837, 838, 839, 840, 841, + 842, 843, 844, 845, 846, 847, 848, 849, 850, 851, 852, 853, 854, 855, 856, 857, 858, 859, 860, 861, 862, + 863, 864, 865, 866, 867, 868, 869, 
870, 871, 872, 873, 874, 875, 876, 877, 878, 879, 880, 881, 882, 883, + 884, 885, 886, 887, 888, 889, 890, 891, 892, 893, 894, 895, 896, 897, 898, 899, 900, 901, 902, 903, 904, + 905, 906, 907, 908, 909, 910, 911, 912, 913, 914, 915, 916, 917, 918, 919, 920, 921, 922, 923, 924, 925, + 926, 927, 928, 929, 930, 931, 932, 933, 934, 935, 936, 937, 938, 939, 940, 941, 942, 943, 944, 945, 946, + 947, 948, 949, 950, 951, 952, 953, 954, 955, 956, 957, 958, 959, 960, 961, 962, 963, 964, 965, 966, 967, + 968, 969, 970, 971, 972, 973, 974, 975, 976, 977, 978, 979, 980, 981, 982, 983, 984, 985, 986, 987, 988, + 989, 990, 991, 992, 993, 994, 995, 996, 997, 998, 999, 1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007, + 1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015, 1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023, 1024 + ], + "dims": [32, 32], + "type": "float32" + }, + { + "dims": [32, 2, 8], + "type": "uint8", + "data": [ + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, + 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, + 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, + 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, + 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, + 128, 29, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, + 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, + 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, + 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, + 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, + 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, + 254, 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, + 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, + 296, 297, 298, 299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314, 315, 316, + 317, 318, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 337, + 338, 339, 340, 341, 342, 343, 344, 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, 358, + 359, 360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376, 377, 378, 379, + 380, 381, 382, 383, 384, 385, 386, 387, 388, 389, 390, 391, 392, 393, 394, 395, 396, 397, 398, 399, 400, + 401, 402, 403, 404, 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415, 416, 417, 418, 419, 420, 421, + 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434, 435, 436, 437, 438, 439, 440, 441, 442, + 443, 444, 445, 446, 447, 448, 449, 450, 451, 452, 453, 454, 455, 456, 457, 458, 459, 460, 461, 462, 463, + 464, 465, 466, 467, 468, 469, 470, 471, 472, 473, 474, 475, 476, 477, 478, 479, 480, 481, 482, 483, 484, + 485, 486, 487, 488, 489, 490, 491, 492, 493, 494, 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, + 506, 507, 508, 509, 510, 511, 512 + ] + }, + { + "dims": [64], + "type": "float32", + "data": [ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 
20, 21, 22, 23, 24, 25, 26, 27, 28, + 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, + 55, 56, 57, 58, 59, 60, 61, 62, 63 + ] + }, + { + "dims": [32], + "type": "uint8", + "data": [ + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 + ] + } + ], + "outputs": [ + { + "dims": [32, 32], + "type": "float32", + "data": [ + -1116, -1860, -1516, -84, 2436, 6044, 10740, 16524, 23364, 31356, 40404, 50540, 61764, 74076, 87476, + 86092, -24924, -16964, -7916, 2220, 13444, 25756, 39156, 53644, 69220, 85884, 103636, 122476, 142404, + 163420, 185524, 176460, -2492, -2404, 820, 7180, 16676, 29308, 45076, 63980, 88548, 111196, 139508, + 170956, 205540, 243260, 284116, 296364, -33468, -8292, 20020, 51468, 86052, 123772, 164628, 208620, + 255748, 306012, 359412, 415948, 475620, 538428, 604372, 608940, -3868, -2948, 3156, 14444, 30916, 52572, + 79412, 111436, 153732, 191036, 238612, 291372, 349316, 412444, 480756, 506636, -42012, 380, 47956, 100716, + 158660, 221788, 290100, 363596, 442276, 526140, 615188, 709420, 808836, 913436, 1023220, 1041420, -5244, + -3492, 5492, 21708, 45156, 75836, 113748, 158892, 218916, 270876, 337716, 411788, 493092, 581628, 677396, + 716908, -50556, 9052, 75892, 149964, 231268, 319804, 415572, 518572, 628804, 746268, 870964, 1002892, + 1142052, 1288444, 1442068, 1473900, -6620, -4036, 7828, 28972, 59396, 99100, 148084, 206348, 284100, + 350716, 436820, 532204, 636868, 750812, 874036, 927180, -59100, 17724, 103828, 199212, 303876, 417820, + 541044, 673548, 815332, 966396, 1126740, 1296364, 1475268, 1663452, 1860916, 1906380, -7996, -4580, 10164, + 36236, 73636, 122364, 182420, 253804, 349284, 430556, 535924, 652620, 780644, 919996, 1070676, 1137452, + -67644, 26396, 131764, 248460, 376484, 515836, 666516, 828524, 1001860, 1186524, 1382516, 1589836, + 1808484, 2038460, 2279764, 2338860, -9372, -5124, 12500, 43500, 87876, 145628, 216756, 301260, 414468, + 510396, 635028, 773036, 924420, 1089180, 1267316, 1347724, -76188, 35068, 159700, 297708, 449092, 613852, + 791988, 983500, 1188388, 1406652, 1638292, 1883308, 2141700, 2413468, 2698612, 2771340, -10748, -5668, + 14836, 50764, 102116, 168892, 251092, 348716, 479652, 590236, 734132, 893452, 1068196, 1258364, 1463956, + 1557996, -84732, 43740, 187636, 346956, 521700, 711868, 917460, 1138476, 1374916, 1626780, 1894068, + 2176780, 2474916, 2788476, 3117460, 3203820, -12124, -6212, 17172, 58028, 116356, 192156, 285428, 396172, + 544836, 670076, 833236, 1013868, 1211972, 1427548, 1660596, 1768268, -93276, 52412, 215572, 396204, + 594308, 809884, 1042932, 1293452, 1561444, 1846908, 2149844, 2470252, 2808132, 3163484, 3536308, 3636300, + -13500, -6756, 19508, 65292, 130596, 215420, 319764, 443628, 610020, 749916, 932340, 1134284, 1355748, + 1596732, 1857236, 1978540, -101820, 61084, 243508, 445452, 666916, 907900, 1168404, 1448428, 1747972, + 2067036, 2405620, 2763724, 3141348, 3538492, 3955156, 4068780, -14876, -7300, 21844, 72556, 144836, + 238684, 354100, 491084, 675204, 829756, 1031444, 1254700, 1499524, 1765916, 2053876, 2188812, -110364, + 69756, 271444, 494700, 739524, 1005916, 1293876, 1603404, 1934500, 2287164, 2661396, 3057196, 3474564, + 3913500, 4374004, 4501260, -16252, -7844, 24180, 79820, 159076, 261948, 388436, 538540, 740388, 909596, + 1130548, 1375116, 1643300, 1935100, 2250516, 2399084, -118908, 78428, 299380, 543948, 812132, 1103932, + 1419348, 1758380, 
2121028, 2507292, 2917172, 3350668, 3807780, 4288508, 4792852, 4933740, -17628, -8388, + 26516, 87084, 173316, 285212, 422772, 585996, 805572, 989436, 1229652, 1495532, 1787076, 2104284, 2447156, + 2609356, -127452, 87100, 327316, 593196, 884740, 1201948, 1544820, 1913356, 2307556, 2727420, 3172948, + 3644140, 4140996, 4663516, 5211700, 5366220, -19004, -8932, 28852, 94348, 187556, 308476, 457108, 633452, + 870756, 1069276, 1328756, 1615948, 1930852, 2273468, 2643796, 2819628, -135996, 95772, 355252, 642444, + 957348, 1299964, 1670292, 2068332, 2494084, 2947548, 3428724, 3937612, 4474212, 5038524, 5630548, 5798700, + -20380, -9476, 31188, 101612, 201796, 331740, 491444, 680908, 935940, 1149116, 1427860, 1736364, 2074628, + 2442652, 2840436, 3029900, -144540, 104444, 383188, 691692, 1029956, 1397980, 1795764, 2223308, 2680612, + 3167676, 3684500, 4231084, 4807428, 5413532, 6049396, 6231180, -21756, -10020, 33524, 108876, 216036, + 355004, 525780, 728364, 1001124, 1228956, 1526964, 1856780, 2218404, 2611836, 3037076, 3240172, -153084, + 113116, 411124, 740940, 1102564, 1495996, 1921236, 2378284, 2867140, 3387804, 3940276, 4524556, 5140644, + 5788540, 6468244, 6663660, -23132, -10564, 35860, 116140, 230276, 378268, 560116, 775820, 1066308, + 1308796, 1626068, 1977196, 2362180, 2781020, 3233716, 3450444, -161628, 121788, 439060, 790188, 1175172, + 1594012, 2046708, 2533260, 3053668, 3607932, 4196052, 4818028, 5473860, 6163548, 6887092, 7096140, -24508, + -11108, 38196, 123404, 244516, 401532, 594452, 823276, 1131492, 1388636, 1725172, 2097612, 2505956, + 2950204, 3430356, 3660716, -170172, 130460, 466996, 839436, 1247780, 1692028, 2172180, 2688236, 3240196, + 3828060, 4451828, 5111500, 5807076, 6538556, 7305940, 7528620, -25884, -11652, 40532, 130668, 258756, + 424796, 628788, 870732, 1196676, 1468476, 1824276, 2218028, 2649732, 3119388, 3626996, 3870988, -178716, + 139132, 494932, 888684, 1320388, 1790044, 2297652, 2843212, 3426724, 4048188, 4707604, 5404972, 6140292, + 6913564, 7724788, 7961100, -27260, -12196, 42868, 137932, 272996, 448060, 663124, 918188, 1261860, + 1548316, 1923380, 2338444, 2793508, 3288572, 3823636, 4081260, -187260, 147804, 522868, 937932, 1392996, + 1888060, 2423124, 2998188, 3613252, 4268316, 4963380, 5698444, 6473508, 7288572, 8143636, 8393580, -28636, + -12740, 45204, 145196, 287236, 471324, 697460, 965644, 1327044, 1628156, 2022484, 2458860, 2937284, + 3457756, 4020276, 4291532, -195804, 156476, 550804, 987180, 1465604, 1986076, 2548596, 3153164, 3799780, + 4488444, 5219156, 5991916, 6806724, 7663580, 8562484, 8826060, -30012, -13284, 47540, 152460, 301476, + 494588, 731796, 1013100, 1392228, 1707996, 2121588, 2579276, 3081060, 3626940, 4216916, 4501804, -204348, + 165148, 578740, 1036428, 1538212, 2084092, 2674068, 3308140, 3986308, 4708572, 5474932, 6285388, 7139940, + 8038588, 8981332, 9258540, -31388, -13828, 49876, 159724, 315716, 517852, 766132, 1060556, 1457412, + 1787836, 2220692, 2699692, 3224836, 3796124, 4413556, 4712076, -212892, 173820, 606676, 1085676, 1610820, + 2182108, 2799540, 3463116, 4172836, 4928700, 5730708, 6578860, 7473156, 8413596, 9400180, 9691020, -32764, + -14372, 52212, 166988, 329956, 541116, 800468, 1108012, 1522596, 1867676, 2319796, 2820108, 3368612, + 3965308, 4610196, 4922348, -221436, 182492, 634612, 1134924, 1683428, 2280124, 2925012, 3618092, 4359364, + 5148828, 5986484, 6872332, 7806372, 8788604, 9819028, 10123500, -34140, -14916, 54548, 174252, 344196, + 564380, 834804, 1155468, 1587780, 1947516, 2418900, 2940524, 3512388, 
4134492, 4806836, 5132620, -229980, + 191164, 662548, 1184172, 1756036, 2378140, 3050484, 3773068, 4545892, 5368956, 6242260, 7165804, 8139588, + 9163612, 10237876, 10555980, -35516, -15460, 56884, 181516, 358436, 587644, 869140, 1202924, 1652964, + 2027356, 2518004, 3060940, 3656164, 4303676, 5003476, 5342892, -238524, 199836, 690484, 1233420, 1828644, + 2476156, 3175956, 3928044, 4732420, 5589084, 6498036, 7459276, 8472804, 9538620, 10656724, 10988460, + -36892, -16004, 59220, 188780, 372676, 610908, 903476, 1250380, 1718148, 2107196, 2617108, 3181356, + 3799940, 4472860, 5200116, 5553164, -247068, 208508, 718420, 1282668, 1901252, 2574172, 3301428, 4083020, + 4918948, 5809212, 6753812, 7752748, 8806020, 9913628, 11075572, 11420940, -38268, -16548, 61556, 196044, + 386916, 634172, 937812, 1297836, 1783332, 2187036, 2716212, 3301772, 3943716, 4642044, 5396756, 5763436, + -255612, 217180, 746356, 1331916, 1973860, 2672188, 3426900, 4237996, 5105476, 6029340, 7009588, 8046220, + 9139236, 10288636, 11494420, 11853420, -39644, -17092, 63892, 203308, 401156, 657436, 972148, 1345292, + 1848516, 2266876, 2815316, 3422188, 4087492, 4811228, 5593396, 5973708, -264156, 225852, 774292, 1381164, + 2046468, 2770204, 3552372, 4392972, 5292004, 6249468, 7265364, 8339692, 9472452, 10663644, 11913268, + 12285900, -41020, -17636, 66228, 210572, 415396, 680700, 1006484, 1392748, 1913700, 2346716, 2914420, + 3542604, 4231268, 4980412, 5790036, 6183980, -272700, 234524, 802228, 1430412, 2119076, 2868220, 3677844, + 4547948, 5478532, 6469596, 7521140, 8633164, 9805668, 11038652, 12332116, 12718380, -42396, -18180, 68564, + 217836, 429636, 703964, 1040820, 1440204, 1978884, 2426556, 3013524, 3663020, 4375044, 5149596, 5986676, + 6394252, -281244, 243196, 830164, 1479660, 2191684, 2966236, 3803316, 4702924, 5665060, 6689724, 7776916, + 8926636, 10138884, 11413660, 12750964, 13150860, -43772, -18724, 70900, 225100, 443876, 727228, 1075156, + 1487660, 2044068, 2506396, 3112628, 3783436, 4518820, 5318780, 6183316, 6604524, -289788, 251868, 858100, + 1528908, 2264292, 3064252, 3928788, 4857900, 5851588, 6909852, 8032692, 9220108, 10472100, 11788668, + 13169812, 13583340 + ] + } + ] + } + ] + }, + { + "name": "MatMulNBits; K=32, N=32, block_size=32, bits=4", + "operator": "MatMulNBits", + "opset": { "domain": "com.microsoft", "version": 1 }, + "attributes": [ + { "name": "K", "data": 32, "type": "int" }, + { "name": "N", "data": 32, "type": "int" }, + { "name": "block_size", "data": 32, "type": "int" }, + { "name": "bits", "data": 4, "type": "int" } + ], + "cases": [ + { + "name": "MatMulNBits; K=32, N=32, block_size=32, bits=4; symmetric", + "inputs": [ + { + "data": [ + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, + 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, + 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, + 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, + 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, + 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, + 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, + 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, + 
191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, + 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, + 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, + 254, 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, + 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, + 296, 297, 298, 299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314, 315, 316, + 317, 318, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 337, + 338, 339, 340, 341, 342, 343, 344, 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, 358, + 359, 360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376, 377, 378, 379, + 380, 381, 382, 383, 384, 385, 386, 387, 388, 389, 390, 391, 392, 393, 394, 395, 396, 397, 398, 399, 400, + 401, 402, 403, 404, 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415, 416, 417, 418, 419, 420, 421, + 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434, 435, 436, 437, 438, 439, 440, 441, 442, + 443, 444, 445, 446, 447, 448, 449, 450, 451, 452, 453, 454, 455, 456, 457, 458, 459, 460, 461, 462, 463, + 464, 465, 466, 467, 468, 469, 470, 471, 472, 473, 474, 475, 476, 477, 478, 479, 480, 481, 482, 483, 484, + 485, 486, 487, 488, 489, 490, 491, 492, 493, 494, 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, + 506, 507, 508, 509, 510, 511, 512, 513, 514, 515, 516, 517, 518, 519, 520, 521, 522, 523, 524, 525, 526, + 527, 528, 529, 530, 531, 532, 533, 534, 535, 536, 537, 538, 539, 540, 541, 542, 543, 544, 545, 546, 547, + 548, 549, 550, 551, 552, 553, 554, 555, 556, 557, 558, 559, 560, 561, 562, 563, 564, 565, 566, 567, 568, + 569, 570, 571, 572, 573, 574, 575, 576, 577, 578, 579, 580, 581, 582, 583, 584, 585, 586, 587, 588, 589, + 590, 591, 592, 593, 594, 595, 596, 597, 598, 599, 600, 601, 602, 603, 604, 605, 606, 607, 608, 609, 610, + 611, 612, 613, 614, 615, 616, 617, 618, 619, 620, 621, 622, 623, 624, 625, 626, 627, 628, 629, 630, 631, + 632, 633, 634, 635, 636, 637, 638, 639, 640, 641, 642, 643, 644, 645, 646, 647, 648, 649, 650, 651, 652, + 653, 654, 655, 656, 657, 658, 659, 660, 661, 662, 663, 664, 665, 666, 667, 668, 669, 670, 671, 672, 673, + 674, 675, 676, 677, 678, 679, 680, 681, 682, 683, 684, 685, 686, 687, 688, 689, 690, 691, 692, 693, 694, + 695, 696, 697, 698, 699, 700, 701, 702, 703, 704, 705, 706, 707, 708, 709, 710, 711, 712, 713, 714, 715, + 716, 717, 718, 719, 720, 721, 722, 723, 724, 725, 726, 727, 728, 729, 730, 731, 732, 733, 734, 735, 736, + 737, 738, 739, 740, 741, 742, 743, 744, 745, 746, 747, 748, 749, 750, 751, 752, 753, 754, 755, 756, 757, + 758, 759, 760, 761, 762, 763, 764, 765, 766, 767, 768, 769, 770, 771, 772, 773, 774, 775, 776, 777, 778, + 779, 780, 781, 782, 783, 784, 785, 786, 787, 788, 789, 790, 791, 792, 793, 794, 795, 796, 797, 798, 799, + 800, 801, 802, 803, 804, 805, 806, 807, 808, 809, 810, 811, 812, 813, 814, 815, 816, 817, 818, 819, 820, + 821, 822, 823, 824, 825, 826, 827, 828, 829, 830, 831, 832, 833, 834, 835, 836, 837, 838, 839, 840, 841, + 842, 843, 844, 845, 846, 847, 848, 849, 850, 851, 852, 853, 854, 855, 856, 857, 858, 859, 860, 861, 862, + 863, 864, 865, 866, 867, 868, 869, 870, 871, 872, 873, 874, 875, 876, 877, 878, 879, 880, 881, 882, 883, + 884, 885, 886, 887, 
888, 889, 890, 891, 892, 893, 894, 895, 896, 897, 898, 899, 900, 901, 902, 903, 904, + 905, 906, 907, 908, 909, 910, 911, 912, 913, 914, 915, 916, 917, 918, 919, 920, 921, 922, 923, 924, 925, + 926, 927, 928, 929, 930, 931, 932, 933, 934, 935, 936, 937, 938, 939, 940, 941, 942, 943, 944, 945, 946, + 947, 948, 949, 950, 951, 952, 953, 954, 955, 956, 957, 958, 959, 960, 961, 962, 963, 964, 965, 966, 967, + 968, 969, 970, 971, 972, 973, 974, 975, 976, 977, 978, 979, 980, 981, 982, 983, 984, 985, 986, 987, 988, + 989, 990, 991, 992, 993, 994, 995, 996, 997, 998, 999, 1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007, + 1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015, 1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023, 1024 + ], + "dims": [32, 32], + "type": "float32" + }, + { + "dims": [32, 1, 16], + "type": "uint8", + "data": [ + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, + 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, + 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, + 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, + 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, + 128, 29, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, + 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, + 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, + 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, + 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, + 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, + 254, 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, + 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, + 296, 297, 298, 299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314, 315, 316, + 317, 318, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 337, + 338, 339, 340, 341, 342, 343, 344, 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, 358, + 359, 360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376, 377, 378, 379, + 380, 381, 382, 383, 384, 385, 386, 387, 388, 389, 390, 391, 392, 393, 394, 395, 396, 397, 398, 399, 400, + 401, 402, 403, 404, 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415, 416, 417, 418, 419, 420, 421, + 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434, 435, 436, 437, 438, 439, 440, 441, 442, + 443, 444, 445, 446, 447, 448, 449, 450, 451, 452, 453, 454, 455, 456, 457, 458, 459, 460, 461, 462, 463, + 464, 465, 466, 467, 468, 469, 470, 471, 472, 473, 474, 475, 476, 477, 478, 479, 480, 481, 482, 483, 484, + 485, 486, 487, 488, 489, 490, 491, 492, 493, 494, 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, + 506, 507, 508, 509, 510, 511, 512 + ] + }, + { + "dims": [32], + "type": "float32", + "data": [ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, + 29, 30, 31 + ] + } + ], + "outputs": [ + { + "dims": 
[32, 32], + "type": "float32", + "data": [ + 0, -1560, -2576, -3048, -2976, -2360, -1200, 504, 2736, 5544, 8880, 12760, 17184, 22152, 27664, 26040, + -29312, -26520, -23184, -19304, -14880, -9912, -4400, 1656, 8256, 15400, 23088, 31320, 40096, 49416, + 59280, 53816, 0, -5368, -9168, -11400, -12064, -11160, -8688, -4648, 2224, 8136, 16880, 27192, 39072, + 52520, 67536, 68760, -98432, -91256, -82512, -72200, -60320, -46872, -31856, -15272, 2880, 22600, 43888, + 66744, 91168, 117160, 144720, 142104, 0, -9176, -15760, -19752, -21152, -19960, -16176, -9800, 1712, + 10728, 24880, 41624, 60960, 82888, 107408, 111480, -167552, -155992, -141840, -125096, -105760, -83832, + -59312, -32200, -2496, 29800, 64688, 102168, 142240, 184904, 230160, 230392, 0, -12984, -22352, -28104, + -30240, -28760, -23664, -14952, 1200, 13320, 32880, 56056, 82848, 113256, 147280, 154200, -236672, + -220728, -201168, -177992, -151200, -120792, -86768, -49128, -7872, 37000, 85488, 137592, 193312, 252648, + 315600, 318680, 0, -16792, -28944, -36456, -39328, -37560, -31152, -20104, 688, 15912, 40880, 70488, + 104736, 143624, 187152, 196920, -305792, -285464, -260496, -230888, -196640, -157752, -114224, -66056, + -13248, 44200, 106288, 173016, 244384, 320392, 401040, 406968, 0, -20600, -35536, -44808, -48416, -46360, + -38640, -25256, 176, 18504, 48880, 84920, 126624, 173992, 227024, 239640, -374912, -350200, -319824, + -283784, -242080, -194712, -141680, -82984, -18624, 51400, 127088, 208440, 295456, 388136, 486480, 495256, + 0, -24408, -42128, -53160, -57504, -55160, -46128, -30408, -336, 21096, 56880, 99352, 148512, 204360, + 266896, 282360, -444032, -414936, -379152, -336680, -287520, -231672, -169136, -99912, -24000, 58600, + 147888, 243864, 346528, 455880, 571920, 583544, 0, -28216, -48720, -61512, -66592, -63960, -53616, -35560, + -848, 23688, 64880, 113784, 170400, 234728, 306768, 325080, -513152, -479672, -438480, -389576, -332960, + -268632, -196592, -116840, -29376, 65800, 168688, 279288, 397600, 523624, 657360, 671832, 0, -32024, + -55312, -69864, -75680, -72760, -61104, -40712, -1360, 26280, 72880, 128216, 192288, 265096, 346640, + 367800, -582272, -544408, -497808, -442472, -378400, -305592, -224048, -133768, -34752, 73000, 189488, + 314712, 448672, 591368, 742800, 760120, 0, -35832, -61904, -78216, -84768, -81560, -68592, -45864, -1872, + 28872, 80880, 142648, 214176, 295464, 386512, 410520, -651392, -609144, -557136, -495368, -423840, + -342552, -251504, -150696, -40128, 80200, 210288, 350136, 499744, 659112, 828240, 848408, 0, -39640, + -68496, -86568, -93856, -90360, -76080, -51016, -2384, 31464, 88880, 157080, 236064, 325832, 426384, + 453240, -720512, -673880, -616464, -548264, -469280, -379512, -278960, -167624, -45504, 87400, 231088, + 385560, 550816, 726856, 913680, 936696, 0, -43448, -75088, -94920, -102944, -99160, -83568, -56168, -2896, + 34056, 96880, 171512, 257952, 356200, 466256, 495960, -789632, -738616, -675792, -601160, -514720, + -416472, -306416, -184552, -50880, 94600, 251888, 420984, 601888, 794600, 999120, 1024984, 0, -47256, + -81680, -103272, -112032, -107960, -91056, -61320, -3408, 36648, 104880, 185944, 279840, 386568, 506128, + 538680, -858752, -803352, -735120, -654056, -560160, -453432, -333872, -201480, -56256, 101800, 272688, + 456408, 652960, 862344, 1084560, 1113272, 0, -51064, -88272, -111624, -121120, -116760, -98544, -66472, + -3920, 39240, 112880, 200376, 301728, 416936, 546000, 581400, -927872, -868088, -794448, -706952, -605600, + -490392, -361328, -218408, -61632, 
109000, 293488, 491832, 704032, 930088, 1170000, 1201560, 0, -54872, + -94864, -119976, -130208, -125560, -106032, -71624, -4432, 41832, 120880, 214808, 323616, 447304, 585872, + 624120, -996992, -932824, -853776, -759848, -651040, -527352, -388784, -235336, -67008, 116200, 314288, + 527256, 755104, 997832, 1255440, 1289848, 0, -58680, -101456, -128328, -139296, -134360, -113520, -76776, + -4944, 44424, 128880, 229240, 345504, 477672, 625744, 666840, -1066112, -997560, -913104, -812744, + -696480, -564312, -416240, -252264, -72384, 123400, 335088, 562680, 806176, 1065576, 1340880, 1378136, 0, + -62488, -108048, -136680, -148384, -143160, -121008, -81928, -5456, 47016, 136880, 243672, 367392, 508040, + 665616, 709560, -1135232, -1062296, -972432, -865640, -741920, -601272, -443696, -269192, -77760, 130600, + 355888, 598104, 857248, 1133320, 1426320, 1466424, 0, -66296, -114640, -145032, -157472, -151960, -128496, + -87080, -5968, 49608, 144880, 258104, 389280, 538408, 705488, 752280, -1204352, -1127032, -1031760, + -918536, -787360, -638232, -471152, -286120, -83136, 137800, 376688, 633528, 908320, 1201064, 1511760, + 1554712, 0, -70104, -121232, -153384, -166560, -160760, -135984, -92232, -6480, 52200, 152880, 272536, + 411168, 568776, 745360, 795000, -1273472, -1191768, -1091088, -971432, -832800, -675192, -498608, -303048, + -88512, 145000, 397488, 668952, 959392, 1268808, 1597200, 1643000, 0, -73912, -127824, -161736, -175648, + -169560, -143472, -97384, -6992, 54792, 160880, 286968, 433056, 599144, 785232, 837720, -1342592, + -1256504, -1150416, -1024328, -878240, -712152, -526064, -319976, -93888, 152200, 418288, 704376, 1010464, + 1336552, 1682640, 1731288, 0, -77720, -134416, -170088, -184736, -178360, -150960, -102536, -7504, 57384, + 168880, 301400, 454944, 629512, 825104, 880440, -1411712, -1321240, -1209744, -1077224, -923680, -749112, + -553520, -336904, -99264, 159400, 439088, 739800, 1061536, 1404296, 1768080, 1819576, 0, -81528, -141008, + -178440, -193824, -187160, -158448, -107688, -8016, 59976, 176880, 315832, 476832, 659880, 864976, 923160, + -1480832, -1385976, -1269072, -1130120, -969120, -786072, -580976, -353832, -104640, 166600, 459888, + 775224, 1112608, 1472040, 1853520, 1907864, 0, -85336, -147600, -186792, -202912, -195960, -165936, + -112840, -8528, 62568, 184880, 330264, 498720, 690248, 904848, 965880, -1549952, -1450712, -1328400, + -1183016, -1014560, -823032, -608432, -370760, -110016, 173800, 480688, 810648, 1163680, 1539784, 1938960, + 1996152, 0, -89144, -154192, -195144, -212000, -204760, -173424, -117992, -9040, 65160, 192880, 344696, + 520608, 720616, 944720, 1008600, -1619072, -1515448, -1387728, -1235912, -1060000, -859992, -635888, + -387688, -115392, 181000, 501488, 846072, 1214752, 1607528, 2024400, 2084440, 0, -92952, -160784, -203496, + -221088, -213560, -180912, -123144, -9552, 67752, 200880, 359128, 542496, 750984, 984592, 1051320, + -1688192, -1580184, -1447056, -1288808, -1105440, -896952, -663344, -404616, -120768, 188200, 522288, + 881496, 1265824, 1675272, 2109840, 2172728, 0, -96760, -167376, -211848, -230176, -222360, -188400, + -128296, -10064, 70344, 208880, 373560, 564384, 781352, 1024464, 1094040, -1757312, -1644920, -1506384, + -1341704, -1150880, -933912, -690800, -421544, -126144, 195400, 543088, 916920, 1316896, 1743016, 2195280, + 2261016, 0, -100568, -173968, -220200, -239264, -231160, -195888, -133448, -10576, 72936, 216880, 387992, + 586272, 811720, 1064336, 1136760, -1826432, -1709656, -1565712, -1394600, -1196320, 
-970872, -718256, + -438472, -131520, 202600, 563888, 952344, 1367968, 1810760, 2280720, 2349304, 0, -104376, -180560, + -228552, -248352, -239960, -203376, -138600, -11088, 75528, 224880, 402424, 608160, 842088, 1104208, + 1179480, -1895552, -1774392, -1625040, -1447496, -1241760, -1007832, -745712, -455400, -136896, 209800, + 584688, 987768, 1419040, 1878504, 2366160, 2437592, 0, -108184, -187152, -236904, -257440, -248760, + -210864, -143752, -11600, 78120, 232880, 416856, 630048, 872456, 1144080, 1222200, -1964672, -1839128, + -1684368, -1500392, -1287200, -1044792, -773168, -472328, -142272, 217000, 605488, 1023192, 1470112, + 1946248, 2451600, 2525880, 0, -111992, -193744, -245256, -266528, -257560, -218352, -148904, -12112, + 80712, 240880, 431288, 651936, 902824, 1183952, 1264920, -2033792, -1903864, -1743696, -1553288, -1332640, + -1081752, -800624, -489256, -147648, 224200, 626288, 1058616, 1521184, 2013992, 2537040, 2614168, 0, + -115800, -200336, -253608, -275616, -266360, -225840, -154056, -12624, 83304, 248880, 445720, 673824, + 933192, 1223824, 1307640, -2102912, -1968600, -1803024, -1606184, -1378080, -1118712, -828080, -506184, + -153024, 231400, 647088, 1094040, 1572256, 2081736, 2622480, 2702456, 0, -119608, -206928, -261960, + -284704, -275160, -233328, -159208, -13136, 85896, 256880, 460152, 695712, 963560, 1263696, 1350360, + -2172032, -2033336, -1862352, -1659080, -1423520, -1155672, -855536, -523112, -158400, 238600, 667888, + 1129464, 1623328, 2149480, 2707920, 2790744 + ] + } + ] + } + ] + }, + { + "name": "MatMulNBits; K=32, N=32, block_size=32, bits=4", + "operator": "MatMulNBits", + "opset": { "domain": "com.microsoft", "version": 1 }, + "attributes": [ + { "name": "K", "data": 32, "type": "int" }, + { "name": "N", "data": 32, "type": "int" }, + { "name": "block_size", "data": 32, "type": "int" }, + { "name": "bits", "data": 4, "type": "int" } + ], + "cases": [ + { + "name": "MatMulNBits; K=32, N=32, block_size=32, bits=4; asymmetric", + "inputs": [ + { + "data": [ + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, + 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, + 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, + 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, + 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, + 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, + 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, + 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, + 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, + 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, + 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, + 254, 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, + 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, + 296, 297, 298, 299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314, 315, 316, + 317, 318, 319, 320, 
321, 322, 323, 324, 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 337, + 338, 339, 340, 341, 342, 343, 344, 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, 358, + 359, 360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376, 377, 378, 379, + 380, 381, 382, 383, 384, 385, 386, 387, 388, 389, 390, 391, 392, 393, 394, 395, 396, 397, 398, 399, 400, + 401, 402, 403, 404, 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415, 416, 417, 418, 419, 420, 421, + 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434, 435, 436, 437, 438, 439, 440, 441, 442, + 443, 444, 445, 446, 447, 448, 449, 450, 451, 452, 453, 454, 455, 456, 457, 458, 459, 460, 461, 462, 463, + 464, 465, 466, 467, 468, 469, 470, 471, 472, 473, 474, 475, 476, 477, 478, 479, 480, 481, 482, 483, 484, + 485, 486, 487, 488, 489, 490, 491, 492, 493, 494, 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, + 506, 507, 508, 509, 510, 511, 512, 513, 514, 515, 516, 517, 518, 519, 520, 521, 522, 523, 524, 525, 526, + 527, 528, 529, 530, 531, 532, 533, 534, 535, 536, 537, 538, 539, 540, 541, 542, 543, 544, 545, 546, 547, + 548, 549, 550, 551, 552, 553, 554, 555, 556, 557, 558, 559, 560, 561, 562, 563, 564, 565, 566, 567, 568, + 569, 570, 571, 572, 573, 574, 575, 576, 577, 578, 579, 580, 581, 582, 583, 584, 585, 586, 587, 588, 589, + 590, 591, 592, 593, 594, 595, 596, 597, 598, 599, 600, 601, 602, 603, 604, 605, 606, 607, 608, 609, 610, + 611, 612, 613, 614, 615, 616, 617, 618, 619, 620, 621, 622, 623, 624, 625, 626, 627, 628, 629, 630, 631, + 632, 633, 634, 635, 636, 637, 638, 639, 640, 641, 642, 643, 644, 645, 646, 647, 648, 649, 650, 651, 652, + 653, 654, 655, 656, 657, 658, 659, 660, 661, 662, 663, 664, 665, 666, 667, 668, 669, 670, 671, 672, 673, + 674, 675, 676, 677, 678, 679, 680, 681, 682, 683, 684, 685, 686, 687, 688, 689, 690, 691, 692, 693, 694, + 695, 696, 697, 698, 699, 700, 701, 702, 703, 704, 705, 706, 707, 708, 709, 710, 711, 712, 713, 714, 715, + 716, 717, 718, 719, 720, 721, 722, 723, 724, 725, 726, 727, 728, 729, 730, 731, 732, 733, 734, 735, 736, + 737, 738, 739, 740, 741, 742, 743, 744, 745, 746, 747, 748, 749, 750, 751, 752, 753, 754, 755, 756, 757, + 758, 759, 760, 761, 762, 763, 764, 765, 766, 767, 768, 769, 770, 771, 772, 773, 774, 775, 776, 777, 778, + 779, 780, 781, 782, 783, 784, 785, 786, 787, 788, 789, 790, 791, 792, 793, 794, 795, 796, 797, 798, 799, + 800, 801, 802, 803, 804, 805, 806, 807, 808, 809, 810, 811, 812, 813, 814, 815, 816, 817, 818, 819, 820, + 821, 822, 823, 824, 825, 826, 827, 828, 829, 830, 831, 832, 833, 834, 835, 836, 837, 838, 839, 840, 841, + 842, 843, 844, 845, 846, 847, 848, 849, 850, 851, 852, 853, 854, 855, 856, 857, 858, 859, 860, 861, 862, + 863, 864, 865, 866, 867, 868, 869, 870, 871, 872, 873, 874, 875, 876, 877, 878, 879, 880, 881, 882, 883, + 884, 885, 886, 887, 888, 889, 890, 891, 892, 893, 894, 895, 896, 897, 898, 899, 900, 901, 902, 903, 904, + 905, 906, 907, 908, 909, 910, 911, 912, 913, 914, 915, 916, 917, 918, 919, 920, 921, 922, 923, 924, 925, + 926, 927, 928, 929, 930, 931, 932, 933, 934, 935, 936, 937, 938, 939, 940, 941, 942, 943, 944, 945, 946, + 947, 948, 949, 950, 951, 952, 953, 954, 955, 956, 957, 958, 959, 960, 961, 962, 963, 964, 965, 966, 967, + 968, 969, 970, 971, 972, 973, 974, 975, 976, 977, 978, 979, 980, 981, 982, 983, 984, 985, 986, 987, 988, + 989, 990, 991, 992, 993, 994, 995, 996, 997, 998, 999, 1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007, + 1008, 1009, 1010, 1011, 1012, 1013, 1014, 
1015, 1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023, 1024 + ], + "dims": [32, 32], + "type": "float32" + }, + { + "dims": [32, 1, 16], + "type": "uint8", + "data": [ + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, + 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, + 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, + 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, + 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, + 128, 29, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, + 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, + 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, + 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, + 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, + 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, + 254, 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, + 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, + 296, 297, 298, 299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314, 315, 316, + 317, 318, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 337, + 338, 339, 340, 341, 342, 343, 344, 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, 358, + 359, 360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376, 377, 378, 379, + 380, 381, 382, 383, 384, 385, 386, 387, 388, 389, 390, 391, 392, 393, 394, 395, 396, 397, 398, 399, 400, + 401, 402, 403, 404, 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415, 416, 417, 418, 419, 420, 421, + 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434, 435, 436, 437, 438, 439, 440, 441, 442, + 443, 444, 445, 446, 447, 448, 449, 450, 451, 452, 453, 454, 455, 456, 457, 458, 459, 460, 461, 462, 463, + 464, 465, 466, 467, 468, 469, 470, 471, 472, 473, 474, 475, 476, 477, 478, 479, 480, 481, 482, 483, 484, + 485, 486, 487, 488, 489, 490, 491, 492, 493, 494, 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, + 506, 507, 508, 509, 510, 511, 512 + ] + }, + { + "dims": [32], + "type": "float32", + "data": [ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, + 29, 30, 31 + ] + }, + { + "dims": [32], + "type": "uint8", + "data": [ + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 + ] + } + ], + "outputs": [ + { + "dims": [32, 32], + "type": "float32", + "data": [ + 0, 2664, 5872, 9624, 13920, 18760, 24144, 30072, 36528, 43560, 51120, 59224, 67872, 77064, 86800, 89400, + 38272, 45288, 52848, 60952, 69600, 78792, 88528, 98808, 109632, 121000, 132912, 145368, 158368, 171912, + 186000, 184760, 0, 7048, 15664, 25848, 37600, 50920, 65808, 82264, 101552, 119880, 141040, 163768, 188064, + 213928, 241360, 255000, 100224, 119816, 140976, 163704, 188000, 213864, 
241296, 270296, 300864, 333000, + 366704, 401976, 438816, 477224, 517200, 527000, 0, 11432, 25456, 42072, 61280, 83080, 107472, 134456, + 166576, 196200, 230960, 268312, 308256, 350792, 395920, 420600, 162176, 194344, 229104, 266456, 306400, + 348936, 394064, 441784, 492096, 545000, 600496, 658584, 719264, 782536, 848400, 869240, 0, 15816, 35248, + 58296, 84960, 115240, 149136, 186648, 231600, 272520, 320880, 372856, 428448, 487656, 550480, 586200, + 224128, 268872, 317232, 369208, 424800, 484008, 546832, 613272, 683328, 757000, 834288, 915192, 999712, + 1087848, 1179600, 1211480, 0, 20200, 45040, 74520, 108640, 147400, 190800, 238840, 296624, 348840, 410800, + 477400, 548640, 624520, 705040, 751800, 286080, 343400, 405360, 471960, 543200, 619080, 699600, 784760, + 874560, 969000, 1068080, 1171800, 1280160, 1393160, 1510800, 1553720, 0, 24584, 54832, 90744, 132320, + 179560, 232464, 291032, 361648, 425160, 500720, 581944, 668832, 761384, 859600, 917400, 348032, 417928, + 493488, 574712, 661600, 754152, 852368, 956248, 1065792, 1181000, 1301872, 1428408, 1560608, 1698472, + 1842000, 1895960, 0, 28968, 64624, 106968, 156000, 211720, 274128, 343224, 426672, 501480, 590640, 686488, + 789024, 898248, 1014160, 1083000, 409984, 492456, 581616, 677464, 780000, 889224, 1005136, 1127736, + 1257024, 1393000, 1535664, 1685016, 1841056, 2003784, 2173200, 2238200, 0, 33352, 74416, 123192, 179680, + 243880, 315792, 395416, 491696, 577800, 680560, 791032, 909216, 1035112, 1168720, 1248600, 471936, 566984, + 669744, 780216, 898400, 1024296, 1157904, 1299224, 1448256, 1605000, 1769456, 1941624, 2121504, 2309096, + 2504400, 2580440, 0, 37736, 84208, 139416, 203360, 276040, 357456, 447608, 556720, 654120, 770480, 895576, + 1029408, 1171976, 1323280, 1414200, 533888, 641512, 757872, 882968, 1016800, 1159368, 1310672, 1470712, + 1639488, 1817000, 2003248, 2198232, 2401952, 2614408, 2835600, 2922680, 0, 42120, 94000, 155640, 227040, + 308200, 399120, 499800, 621744, 730440, 860400, 1000120, 1149600, 1308840, 1477840, 1579800, 595840, + 716040, 846000, 985720, 1135200, 1294440, 1463440, 1642200, 1830720, 2029000, 2237040, 2454840, 2682400, + 2919720, 3166800, 3264920, 0, 46504, 103792, 171864, 250720, 340360, 440784, 551992, 686768, 806760, + 950320, 1104664, 1269792, 1445704, 1632400, 1745400, 657792, 790568, 934128, 1088472, 1253600, 1429512, + 1616208, 1813688, 2021952, 2241000, 2470832, 2711448, 2962848, 3225032, 3498000, 3607160, 0, 50888, + 113584, 188088, 274400, 372520, 482448, 604184, 751792, 883080, 1040240, 1209208, 1389984, 1582568, + 1786960, 1911000, 719744, 865096, 1022256, 1191224, 1372000, 1564584, 1768976, 1985176, 2213184, 2453000, + 2704624, 2968056, 3243296, 3530344, 3829200, 3949400, 0, 55272, 123376, 204312, 298080, 404680, 524112, + 656376, 816816, 959400, 1130160, 1313752, 1510176, 1719432, 1941520, 2076600, 781696, 939624, 1110384, + 1293976, 1490400, 1699656, 1921744, 2156664, 2404416, 2665000, 2938416, 3224664, 3523744, 3835656, + 4160400, 4291640, 0, 59656, 133168, 220536, 321760, 436840, 565776, 708568, 881840, 1035720, 1220080, + 1418296, 1630368, 1856296, 2096080, 2242200, 843648, 1014152, 1198512, 1396728, 1608800, 1834728, 2074512, + 2328152, 2595648, 2877000, 3172208, 3481272, 3804192, 4140968, 4491600, 4633880, 0, 64040, 142960, 236760, + 345440, 469000, 607440, 760760, 946864, 1112040, 1310000, 1522840, 1750560, 1993160, 2250640, 2407800, + 905600, 1088680, 1286640, 1499480, 1727200, 1969800, 2227280, 2499640, 2786880, 3089000, 3406000, 3737880, + 4084640, 4446280, 4822800, 
4976120, 0, 68424, 152752, 252984, 369120, 501160, 649104, 812952, 1011888, + 1188360, 1399920, 1627384, 1870752, 2130024, 2405200, 2573400, 967552, 1163208, 1374768, 1602232, 1845600, + 2104872, 2380048, 2671128, 2978112, 3301000, 3639792, 3994488, 4365088, 4751592, 5154000, 5318360, 0, + 72808, 162544, 269208, 392800, 533320, 690768, 865144, 1076912, 1264680, 1489840, 1731928, 1990944, + 2266888, 2559760, 2739000, 1029504, 1237736, 1462896, 1704984, 1964000, 2239944, 2532816, 2842616, + 3169344, 3513000, 3873584, 4251096, 4645536, 5056904, 5485200, 5660600, 0, 77192, 172336, 285432, 416480, + 565480, 732432, 917336, 1141936, 1341000, 1579760, 1836472, 2111136, 2403752, 2714320, 2904600, 1091456, + 1312264, 1551024, 1807736, 2082400, 2375016, 2685584, 3014104, 3360576, 3725000, 4107376, 4507704, + 4925984, 5362216, 5816400, 6002840, 0, 81576, 182128, 301656, 440160, 597640, 774096, 969528, 1206960, + 1417320, 1669680, 1941016, 2231328, 2540616, 2868880, 3070200, 1153408, 1386792, 1639152, 1910488, + 2200800, 2510088, 2838352, 3185592, 3551808, 3937000, 4341168, 4764312, 5206432, 5667528, 6147600, + 6345080, 0, 85960, 191920, 317880, 463840, 629800, 815760, 1021720, 1271984, 1493640, 1759600, 2045560, + 2351520, 2677480, 3023440, 3235800, 1215360, 1461320, 1727280, 2013240, 2319200, 2645160, 2991120, + 3357080, 3743040, 4149000, 4574960, 5020920, 5486880, 5972840, 6478800, 6687320, 0, 90344, 201712, 334104, + 487520, 661960, 857424, 1073912, 1337008, 1569960, 1849520, 2150104, 2471712, 2814344, 3178000, 3401400, + 1277312, 1535848, 1815408, 2115992, 2437600, 2780232, 3143888, 3528568, 3934272, 4361000, 4808752, + 5277528, 5767328, 6278152, 6810000, 7029560, 0, 94728, 211504, 350328, 511200, 694120, 899088, 1126104, + 1402032, 1646280, 1939440, 2254648, 2591904, 2951208, 3332560, 3567000, 1339264, 1610376, 1903536, + 2218744, 2556000, 2915304, 3296656, 3700056, 4125504, 4573000, 5042544, 5534136, 6047776, 6583464, + 7141200, 7371800, 0, 99112, 221296, 366552, 534880, 726280, 940752, 1178296, 1467056, 1722600, 2029360, + 2359192, 2712096, 3088072, 3487120, 3732600, 1401216, 1684904, 1991664, 2321496, 2674400, 3050376, + 3449424, 3871544, 4316736, 4785000, 5276336, 5790744, 6328224, 6888776, 7472400, 7714040, 0, 103496, + 231088, 382776, 558560, 758440, 982416, 1230488, 1532080, 1798920, 2119280, 2463736, 2832288, 3224936, + 3641680, 3898200, 1463168, 1759432, 2079792, 2424248, 2792800, 3185448, 3602192, 4043032, 4507968, + 4997000, 5510128, 6047352, 6608672, 7194088, 7803600, 8056280, 0, 107880, 240880, 399000, 582240, 790600, + 1024080, 1282680, 1597104, 1875240, 2209200, 2568280, 2952480, 3361800, 3796240, 4063800, 1525120, + 1833960, 2167920, 2527000, 2911200, 3320520, 3754960, 4214520, 4699200, 5209000, 5743920, 6303960, + 6889120, 7499400, 8134800, 8398520, 0, 112264, 250672, 415224, 605920, 822760, 1065744, 1334872, 1662128, + 1951560, 2299120, 2672824, 3072672, 3498664, 3950800, 4229400, 1587072, 1908488, 2256048, 2629752, + 3029600, 3455592, 3907728, 4386008, 4890432, 5421000, 5977712, 6560568, 7169568, 7804712, 8466000, + 8740760, 0, 116648, 260464, 431448, 629600, 854920, 1107408, 1387064, 1727152, 2027880, 2389040, 2777368, + 3192864, 3635528, 4105360, 4395000, 1649024, 1983016, 2344176, 2732504, 3148000, 3590664, 4060496, + 4557496, 5081664, 5633000, 6211504, 6817176, 7450016, 8110024, 8797200, 9083000, 0, 121032, 270256, + 447672, 653280, 887080, 1149072, 1439256, 1792176, 2104200, 2478960, 2881912, 3313056, 3772392, 4259920, + 4560600, 1710976, 2057544, 2432304, 2835256, 
3266400, 3725736, 4213264, 4728984, 5272896, 5845000, + 6445296, 7073784, 7730464, 8415336, 9128400, 9425240, 0, 125416, 280048, 463896, 676960, 919240, 1190736, + 1491448, 1857200, 2180520, 2568880, 2986456, 3433248, 3909256, 4414480, 4726200, 1772928, 2132072, + 2520432, 2938008, 3384800, 3860808, 4366032, 4900472, 5464128, 6057000, 6679088, 7330392, 8010912, + 8720648, 9459600, 9767480, 0, 129800, 289840, 480120, 700640, 951400, 1232400, 1543640, 1922224, 2256840, + 2658800, 3091000, 3553440, 4046120, 4569040, 4891800, 1834880, 2206600, 2608560, 3040760, 3503200, + 3995880, 4518800, 5071960, 5655360, 6269000, 6912880, 7587000, 8291360, 9025960, 9790800, 10109720, 0, + 134184, 299632, 496344, 724320, 983560, 1274064, 1595832, 1987248, 2333160, 2748720, 3195544, 3673632, + 4182984, 4723600, 5057400, 1896832, 2281128, 2696688, 3143512, 3621600, 4130952, 4671568, 5243448, + 5846592, 6481000, 7146672, 7843608, 8571808, 9331272, 10122000, 10451960, 0, 138568, 309424, 512568, + 748000, 1015720, 1315728, 1648024, 2052272, 2409480, 2838640, 3300088, 3793824, 4319848, 4878160, 5223000, + 1958784, 2355656, 2784816, 3246264, 3740000, 4266024, 4824336, 5414936, 6037824, 6693000, 7380464, + 8100216, 8852256, 9636584, 10453200, 10794200 + ] + } + ] + } + ] + }, + { + "name": "MatMulNBits; K=16, N=8, block_size=16, bits=4, batchDim = [1]", + "operator": "MatMulNBits", + "opset": { "domain": "com.microsoft", "version": 1 }, + "attributes": [ + { "name": "K", "data": 16, "type": "int" }, + { "name": "N", "data": 8, "type": "int" }, + { "name": "block_size", "data": 16, "type": "int" }, + { "name": "bits", "data": 4, "type": "int" } + ], + "cases": [ + { + "name": "MatMulNBits; K=16, N=8, block_size=16, bits=4, batchDim = [1]; symmetric", + "inputs": [ + { + "data": [ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, + 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, + 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, + 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, + 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, + 127 + ], + "dims": [1, 8, 16], + "type": "float32" + }, + { + "dims": [8, 1, 8], + "type": "uint8", + "data": [ + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, + 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, + 56, 57, 58, 59, 60, 61, 62, 63, 64 + ] + }, + { + "dims": [1, 8], + "type": "float32", + "data": [0, 1, 2, 3, 4, 5, 6, 7] + } + ], + "outputs": [ + { + "dims": [1, 8, 8], + "type": "float32", + "data": [ + 0, -385, -1120, -963, -1984, -1285, -2592, -1351, 0, -1073, -3808, -2643, -6848, -3445, -9120, -3479, 0, + -1761, -6496, -4323, -11712, -5605, -15648, -5607, 0, -2449, -9184, -6003, -16576, -7765, -22176, -7735, + 0, -3137, -11872, -7683, -21440, -9925, -28704, -9863, 0, -3825, -14560, -9363, -26304, -12085, -35232, + -11991, 0, -4513, -17248, -11043, -31168, -14245, -41760, -14119, 0, -5201, -19936, -12723, -36032, + -16405, -48288, -16247 + ] + } + ] + } + ] + }, + { + "name": "MatMulNBits; K=16, N=8, block_size=16, bits=4, batchDim = [1, 2]", + "operator": "MatMulNBits", + "opset": { "domain": "com.microsoft", "version": 1 }, + "attributes": [ + { "name": "K", "data": 16, "type": "int" 
}, + { "name": "N", "data": 8, "type": "int" }, + { "name": "block_size", "data": 16, "type": "int" }, + { "name": "bits", "data": 4, "type": "int" } + ], + "cases": [ + { + "name": "MatMulNBits; K=16, N=8, block_size=16, bits=4; symmetric, batchDim = [1, 2]", + "inputs": [ + { + "data": [ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, + 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, + 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, + 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, + 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, + 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, + 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, + 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, + 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, + 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, + 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, + 253, 254, 255 + ], + "dims": [1, 2, 8, 16], + "type": "float32" + }, + { + "dims": [8, 1, 8], + "type": "uint8", + "data": [ + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, + 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, + 56, 57, 58, 59, 60, 61, 62, 63, 64 + ] + }, + { + "dims": [1, 8], + "type": "float32", + "data": [0, 1, 2, 3, 4, 5, 6, 7] + } + ], + "outputs": [ + { + "dims": [1, 2, 8, 8], + "type": "float32", + "data": [ + 0, -385, -1120, -963, -1984, -1285, -2592, -1351, 0, -1073, -3808, -2643, -6848, -3445, -9120, -3479, 0, + -1761, -6496, -4323, -11712, -5605, -15648, -5607, 0, -2449, -9184, -6003, -16576, -7765, -22176, -7735, + 0, -3137, -11872, -7683, -21440, -9925, -28704, -9863, 0, -3825, -14560, -9363, -26304, -12085, -35232, + -11991, 0, -4513, -17248, -11043, -31168, -14245, -41760, -14119, 0, -5201, -19936, -12723, -36032, + -16405, -48288, -16247, 0, -5889, -22624, -14403, -40896, -18565, -54816, -18375, 0, -6577, -25312, + -16083, -45760, -20725, -61344, -20503, 0, -7265, -28000, -17763, -50624, -22885, -67872, -22631, 0, + -7953, -30688, -19443, -55488, -25045, -74400, -24759, 0, -8641, -33376, -21123, -60352, -27205, -80928, + -26887, 0, -9329, -36064, -22803, -65216, -29365, -87456, -29015, 0, -10017, -38752, -24483, -70080, + -31525, -93984, -31143, 0, -10705, -41440, -26163, -74944, -33685, -100512, -33271 + ] + } + ] + } + ] + }, + { + "name": "MatMulNBits; output shape = 8 X 16; K=16, N=16, block_size=16, bits=4", + "operator": "MatMulNBits", + "opset": { "domain": "com.microsoft", "version": 1 }, + "attributes": [ + { "name": "K", "data": 16, "type": "int" }, + { "name": "N", "data": 16, "type": "int" }, + { "name": "block_size", "data": 16, "type": "int" }, + { "name": "bits", "data": 4, "type": "int" } + ], + "cases": [ + { + "name": "MatMulNBits; K=16, N=16, block_size=16, bits=4; asymmetric", + "inputs": [ + { + "data": [ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 
22, 23, 24, 25, 26, 27, 28, + 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, + 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, + 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, + 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, + 127 + ], + "dims": [8, 16], + "type": "float32" + }, + { + "dims": [16, 1, 8], + "type": "uint8", + "data": [ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, + 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, + 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, + 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, + 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, + 127 + ] + }, + { + "dims": [16], + "type": "float32", + "data": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] + }, + { + "dims": [16], + "type": "uint8", + "data": [128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128] + } + ], + "outputs": [ + { + "dims": [8, 16], + "type": "float32", + "data": [ + 0, 728, 688, 2376, 1632, 4280, 2832, 6440, 4288, 8856, 6000, 11528, 7968, 14456, 10192, 17640, 0, 2200, + 1840, 7176, 4448, 12920, 7824, 19432, 11968, 26712, 16880, 34760, 22560, 43576, 29008, 53160, 0, 3672, + 2992, 11976, 7264, 21560, 12816, 32424, 19648, 44568, 27760, 57992, 37152, 72696, 47824, 88680, 0, 5144, + 4144, 16776, 10080, 30200, 17808, 45416, 27328, 62424, 38640, 81224, 51744, 101816, 66640, 124200, 0, + 6616, 5296, 21576, 12896, 38840, 22800, 58408, 35008, 80280, 49520, 104456, 66336, 130936, 85456, 159720, + 0, 8088, 6448, 26376, 15712, 47480, 27792, 71400, 42688, 98136, 60400, 127688, 80928, 160056, 104272, + 195240, 0, 9560, 7600, 31176, 18528, 56120, 32784, 84392, 50368, 115992, 71280, 150920, 95520, 189176, + 123088, 230760, 0, 11032, 8752, 35976, 21344, 64760, 37776, 97384, 58048, 133848, 82160, 174152, 110112, + 218296, 141904, 266280 + ] + } + ] + } + ] + }, + { + "name": "MatMulNBits; output shape = 16 X 8; K=16, N=8, block_size=16, bits=4", + "operator": "MatMulNBits", + "opset": { "domain": "com.microsoft", "version": 1 }, + "attributes": [ + { "name": "K", "data": 16, "type": "int" }, + { "name": "N", "data": 8, "type": "int" }, + { "name": "block_size", "data": 16, "type": "int" }, + { "name": "bits", "data": 4, "type": "int" } + ], + "cases": [ + { + "name": "MatMulNBits; K=16, N=8, block_size=16, bits=4; symmetric", + "inputs": [ + { + "data": [ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, + 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, + 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, + 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, + 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, + 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, + 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 
162, 163, 164, 165, 166, 167, 168, + 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, + 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, + 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, + 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, + 253, 254, 255 + ], + "dims": [16, 16], + "type": "float32" + }, + { + "dims": [8, 1, 8], + "type": "uint8", + "data": [ + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, + 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, + 56, 57, 58, 59, 60, 61, 62, 63, 64 + ] + }, + { + "dims": [8], + "type": "float32", + "data": [0, 1, 2, 3, 4, 5, 6, 7] + } + ], + "outputs": [ + { + "dims": [16, 8], + "type": "float32", + "data": [ + 0, -385, -1120, -963, -1984, -1285, -2592, -1351, 0, -1073, -3808, -2643, -6848, -3445, -9120, -3479, 0, + -1761, -6496, -4323, -11712, -5605, -15648, -5607, 0, -2449, -9184, -6003, -16576, -7765, -22176, -7735, + 0, -3137, -11872, -7683, -21440, -9925, -28704, -9863, 0, -3825, -14560, -9363, -26304, -12085, -35232, + -11991, 0, -4513, -17248, -11043, -31168, -14245, -41760, -14119, 0, -5201, -19936, -12723, -36032, + -16405, -48288, -16247, 0, -5889, -22624, -14403, -40896, -18565, -54816, -18375, 0, -6577, -25312, + -16083, -45760, -20725, -61344, -20503, 0, -7265, -28000, -17763, -50624, -22885, -67872, -22631, 0, + -7953, -30688, -19443, -55488, -25045, -74400, -24759, 0, -8641, -33376, -21123, -60352, -27205, -80928, + -26887, 0, -9329, -36064, -22803, -65216, -29365, -87456, -29015, 0, -10017, -38752, -24483, -70080, + -31525, -93984, -31143, 0, -10705, -41440, -26163, -74944, -33685, -100512, -33271 + ] + } + ] + } + ] + } +] diff --git a/js/web/test/data/ops/rotary-embedding.jsonc b/js/web/test/data/ops/rotary-embedding.jsonc new file mode 100644 index 000000000000..1b564ecc7740 --- /dev/null +++ b/js/web/test/data/ops/rotary-embedding.jsonc @@ -0,0 +1,925 @@ +[ + { + "name": "RotaryEmbedding with no attributes", + "operator": "RotaryEmbedding", + "opset": { "domain": "com.microsoft", "version": 1 }, + "attributes": [], + "cases": [ + { + "name": "T[2,8,24] T[1] T[16,3] T[16,3]", + "inputs": [ + { + "data": [ + -1.0408, 0.9166, -1.3042, -1.1097, -1.2188, 1.1676, -1.019, 0.3157, -1.6036, 1.8493, 0.0447, 1.5853, + 0.1036, -0.3514, 0.2421, 0.6463, 0.873, -0.9276, 1.0311, -1.9557, -0.1482, 1.7376, 2.2039, -0.6589, + -1.0574, -0.1188, -0.9078, 0.3452, -0.5713, -0.2351, -0.5912, 1.1312, 0.7562, -1.2023, -0.5833, -0.4407, + 0.1766, 1.0224, -0.4826, -0.5421, -0.5342, -0.6413, 1.3314, -0.4498, 0.5493, 0.0539, 0.2601, 0.857, + 1.0076, -0.7529, -0.225, -0.4327, -1.5071, -0.4586, -1.9791, 0.7787, -0.7749, -0.1398, 1.1414, -0.6354, + 0.0352, -0.4765, -0.0409, 1.1993, 0.5374, -0.193, 2.5211, -0.0452, -0.3105, -0.9407, -0.0034, 1.5199, + -0.848, 0.5266, 0.0299, -0.0498, 1.0651, 0.886, -1.4702, -0.2134, -0.8707, 1.6159, -0.2356, 0.9444, + 0.5937, 0.7203, 0.5061, 1.5192, -0.4897, 0.9231, 0.2654, -0.1441, 0.5407, -1.5476, 0.6455, -1.1382, 0.464, + -0.4986, 0.1289, 2.7631, 0.1405, 1.1191, 2.1134, -0.9754, 0.1757, -0.1319, -0.2735, 0.3355, -0.6008, + -1.1164, 0.2577, -0.7226, -0.9244, 1.8737, 0.6052, 1.1904, 1.2195, -0.047, -1.0914, 1.0223, 0.3152, + 1.7528, -0.765, 1.8299, -0.2784, -0.2719, 0.1885, 2.1432, 0.8527, 
0.0965, -0.0625, 0.8269, 1.0122, + -1.4482, -0.0644, 0.3215, 0.5908, -1.4197, 0.2113, 0.0306, 0.3604, 0.3166, -0.8975, -0.6393, -1.2944, + -0.0243, -0.2354, -0.7087, 1.1566, 0.4296, 0.5599, -0.7776, 0.3339, 0.1759, 2.1108, 1.0702, 0.8279, + -0.2969, 0.712, -0.2068, -0.1548, 0.1553, 0.6207, -0.169, -0.5816, 1.2632, 0.0695, 1.1862, -1.1874, + -0.7468, -0.932, -0.8579, -0.9647, -0.0991, 0.0195, 1.1213, -1.4873, -0.2043, -1.0466, -1.5772, -0.0489, + 0.343, 0.1264, 0.1519, -1.3639, -1.6593, 1.8127, -1.4459, -0.2158, -0.9792, -1.4392, 0.6508, 0.8964, + 0.5717, -0.239, 0.6983, -1.3416, 0.2715, -0.2852, 0.6051, 0.2167, -0.2181, -1.6306, 1.4788, 0.2754, + -0.0261, -0.4618, -0.5646, -1.0389, 0.5819, 1.3697, 0.0002, 1.5333, -1.0556, -0.1254, 0.1527, -0.5996, + -1.0962, 1.6327, 1.3951, 0.8784, 0.3389, 1.2907, 0.3124, 0.7299, 1.422, 0.3375, 0.0438, 1.8698, -0.2635, + -2.0799, -0.6313, 0.409, -1.1458, 0.0784, -1.8848, -1.6165, 0.6179, 0.9905, -0.0729, 0.5054, -0.6681, + -1.4382, 1.7547, -0.9605, -0.4558, -1.6105, 0.2979, 1.1537, -1.5604, 1.2779, -1.2514, 0.6056, 0.5763, + -3.3558, 0.2836, 0.6909, -0.7631, 2.4451, -0.35, 1.3289, -0.6494, 0.3478, 1.0038, -0.2937, 0.9238, + -1.2185, 0.4138, 0.5033, 0.9174, 1.8131, 1.4436, -0.4207, 0.022, -0.6807, -1.3306, 1.5646, 0.3338, 0.7105, + 0.4683, -0.6179, 0.0818, -0.0488, -0.981, -1.3632, 0.0929, -1.7926, -0.2921, -0.4792, 0.6756, -0.3413, + -0.2242, -0.2111, 0.6282, 0.1667, -1.4055, 1.5895, 1.0838, -0.9077, -0.806, 0.7967, -2.9351, 2.4179, + -0.4026, 0.6451, 1.6845, -0.0901, 0.6106, 2.3603, 1.3908, -0.7917, -0.6734, -0.1213, -1.1116, -0.7401, + -0.7879, 0.0606, -2.3337, -1.2603, -1.7245, -0.3533, -0.9421, -0.1776, 0.3992, -1.7142, -0.5319, -0.8848, + 0.6513, 1.0002, -1.4699, -1.4254, 0.7013, 0.2414, 0.2551, -0.7457, 0.3133, -1.0941, -0.3682, -0.0163, + -0.0645, -0.8101, 0.1415, 0.0551, 0.5873, -0.5887, -1.4733, -0.8565, 0.74, -0.5033, 0.0553, 0.9265, + -0.8652, -0.0288, -0.2209, 0.061, 0.6776, 0.4361, -0.8052, 0.3955, 0.8988, 0.8238, 0.2262, 1.2912, 0.6488, + 1.2114, 1.3569, 0.2983, 0.4718, -1.1936, 0.7928, -0.8665, 0.9468, 1.1629, 0.0616, -1.3136, -0.2764, + 0.0277, -0.1126, 0.2342, -0.5866, -1.8219, 1.1079, 0.5795, -1.4249 + ], + "dims": [2, 8, 24], + "type": "float32" + }, + { + "data": [0], + "dims": [1], + "type": "int64" + }, + { + "data": [ + 1.0, 1.0, 1.0, 0.5403, 0.9989, 1.0, -0.4161, 0.9957, 1.0, -0.99, 0.9903, 1.0, -0.6536, 0.9828, 1.0, + 0.2837, 0.9732, 0.9999, 0.9602, 0.9615, 0.9999, 0.7539, 0.9477, 0.9999, -0.1455, 0.9318, 0.9999, -0.9111, + 0.914, 0.9998, -0.8391, 0.8942, 0.9998, 0.0044, 0.8725, 0.9997, 0.8439, 0.8488, 0.9997, 0.9074, 0.8234, + 0.9996, 0.1367, 0.7962, 0.9995, -0.7597, 0.7673, 0.9995 + ], + "dims": [16, 3], + "type": "float32" + }, + { + "data": [ + 0.0, 0.0, 0.0, 0.8415, 0.0464, 0.0022, 0.9093, 0.0927, 0.0043, 0.1411, 0.1388, 0.0065, -0.7568, 0.1846, + 0.0086, -0.9589, 0.23, 0.0108, -0.2794, 0.2749, 0.0129, 0.657, 0.3192, 0.0151, 0.9894, 0.3629, 0.0172, + 0.4121, 0.4057, 0.0194, -0.544, 0.4477, 0.0215, -1.0, 0.4887, 0.0237, -0.5366, 0.5286, 0.0259, 0.4202, + 0.5675, 0.028, 0.9906, 0.605, 0.0302, 0.6503, 0.6413, 0.0323 + ], + "dims": [16, 3], + "type": "float32" + } + ], + "outputs": [ + { + "data": [ + -1.0408, 0.9166, -1.3042, -1.1097, -1.2188, 1.1676, -1.019, 0.3157, -1.6036, 1.8493, 0.0447, 1.5853, + 0.1036, -0.3514, 0.2421, 0.6463, 0.873, -0.9276, 1.0311, -1.9557, -0.1482, 1.7376, 2.2039, -0.6589, + -0.8618, -0.0922, -0.9073, -0.7032, -0.5762, -0.2371, 0.6923, 1.1571, 0.7572, -1.1471, -0.5302, -0.4391, + 0.5516, 1.0461, 
-0.4812, -0.1443, -0.4862, -0.6423, 0.674, -0.4614, 0.5475, 1.1495, 0.2389, 0.8582, + -0.0259, -0.6099, -0.223, 1.0963, -1.5704, -0.4595, 0.9507, 0.6696, -0.7721, -1.7415, 1.2087, -0.6387, + -1.1052, -0.5243, -0.04, -0.4671, 0.4909, -0.1931, -0.1937, -0.0447, -0.3171, 2.6839, -0.0076, 1.5185, + 0.8465, 0.3737, 0.0242, -0.0703, 1.1279, 0.8862, 1.2275, -0.1786, -0.8767, -1.8072, -0.263, 0.9387, + -0.8021, 0.7813, 0.5001, -1.4202, -0.385, 0.9263, -0.0443, -0.2323, 0.548, 1.5696, 0.6193, -1.1346, + 1.7878, -0.516, 0.1192, -2.1572, 0.046, 1.1202, -1.4812, -0.9082, 0.1728, -1.5132, -0.4489, 0.337, + -0.1541, -0.9266, 0.2416, 0.927, -1.1146, 1.8758, -0.4312, 1.3714, 1.2106, -0.4272, -0.8529, 1.0328, + 1.8441, 1.7698, -0.762, 0.2168, 0.1322, -0.2802, 0.146, 2.1002, 0.8437, -0.1534, 0.4321, 0.836, 0.5955, + -1.5452, -0.0491, -0.8794, 0.2418, -1.4203, 0.3635, 0.2362, 0.3672, -0.1128, -0.8664, -0.6354, -1.4409, + -0.3413, -0.2409, -0.3188, 1.1054, 0.4265, 0.5867, -1.3279, 0.3201, 0.0125, 1.8157, 1.0745, 0.7372, + -0.2429, 0.71, -0.4299, -0.2304, 0.1645, 0.9489, -0.1816, -0.5968, 1.0394, 0.0204, 1.1786, -0.3315, + -0.3997, -0.9304, -1.4268, -1.1526, -0.1132, 0.149, 1.3967, -1.4634, -0.1412, -0.6339, -1.5995, -0.1366, + 0.7604, 0.1514, 0.0824, -1.183, -1.6572, 2.0099, -0.9108, -0.2256, 0.4527, -1.8254, 0.6475, 0.8964, + 0.5717, -0.239, 0.6983, -1.3416, 0.2715, -0.2852, 0.6051, 0.2167, -0.2181, -1.6306, 1.4788, 0.2754, + -0.0261, -0.4618, -0.5646, -1.0389, 0.5819, 1.3697, 0.0002, 1.5333, -1.0556, -0.1254, 0.1527, -1.4979, + -1.1358, 1.632, 0.2493, 0.8266, 0.3424, -0.4992, 0.2964, 0.7298, 1.8544, 0.3516, 0.0454, 1.5415, -0.2822, + -2.0774, 1.2323, 0.3963, -1.1503, -0.4775, -1.9287, -1.6164, 0.3998, 0.902, -0.0764, -1.8059, -0.5762, + -1.4362, -0.2706, -1.0183, -0.462, 2.0891, 0.1782, 1.1591, -0.8151, 1.3, -1.2464, -0.5099, 0.5098, + -3.3525, 0.4326, 0.7414, -0.7775, -0.4271, -0.3807, 1.3245, 2.4936, 0.3139, 1.0095, 0.2323, 0.845, + -1.2244, -0.4511, 0.6266, 0.9095, -1.7981, 1.5241, -0.4121, 0.2341, -0.4737, -1.3333, -1.615, 0.4164, + 0.71, -0.2429, -0.5656, 0.0863, 0.0352, -0.7227, -1.3613, -0.0988, -1.9114, -0.3009, 0.1435, 0.7029, + -0.3467, 0.5092, -0.0828, 0.6253, 0.7113, -1.2138, 1.5964, -0.8346, -1.1515, -0.7923, -0.8254, -3.0038, + 2.4033, -0.3398, 0.0922, 1.7053, 1.1114, 0.7462, 2.366, -0.8409, -0.6654, -0.653, -0.7899, -1.0957, + -0.7149, -0.1072, -0.1967, -2.3416, -1.2609, -1.6375, -0.3576, 0.9413, -0.5694, 0.3954, 0.1383, -0.7477, + -0.8689, 1.8286, 0.851, -1.4793, -0.1597, 0.8541, 0.238, 1.4392, -0.5644, 0.3158, -1.0686, -0.1313, + -0.0181, 0.2438, -0.8801, 0.1413, -0.3587, 0.8002, -0.5982, -1.4301, -0.662, 0.7324, -0.725, 0.061, + 0.9293, -0.6902, -0.0125, -0.2089, -0.1664, 0.5428, 0.4245, -0.7901, 0.5665, 0.9044, 0.1948, -0.1723, + 1.2705, 1.0303, 1.2202, 1.3762, -0.2959, 0.7237, -1.2077, 0.7937, -0.6705, 0.9287, 1.0583, 0.0496, + -1.3118, 0.5556, 0.0459, -0.1324, -0.5513, -0.7409, -1.8002, 0.9892, 0.3619, -1.4522 + ], + "dims": [2, 8, 24], + "type": "float32" + } + ] + }, + { + "name": "T[2,8,24] Scalar T[16,3] T[16,3]", + "inputs": [ + { + "data": [ + -1.0408, 0.9166, -1.3042, -1.1097, -1.2188, 1.1676, -1.019, 0.3157, -1.6036, 1.8493, 0.0447, 1.5853, + 0.1036, -0.3514, 0.2421, 0.6463, 0.873, -0.9276, 1.0311, -1.9557, -0.1482, 1.7376, 2.2039, -0.6589, + -1.0574, -0.1188, -0.9078, 0.3452, -0.5713, -0.2351, -0.5912, 1.1312, 0.7562, -1.2023, -0.5833, -0.4407, + 0.1766, 1.0224, -0.4826, -0.5421, -0.5342, -0.6413, 1.3314, -0.4498, 0.5493, 0.0539, 0.2601, 0.857, + 1.0076, -0.7529, -0.225, 
-0.4327, -1.5071, -0.4586, -1.9791, 0.7787, -0.7749, -0.1398, 1.1414, -0.6354, + 0.0352, -0.4765, -0.0409, 1.1993, 0.5374, -0.193, 2.5211, -0.0452, -0.3105, -0.9407, -0.0034, 1.5199, + -0.848, 0.5266, 0.0299, -0.0498, 1.0651, 0.886, -1.4702, -0.2134, -0.8707, 1.6159, -0.2356, 0.9444, + 0.5937, 0.7203, 0.5061, 1.5192, -0.4897, 0.9231, 0.2654, -0.1441, 0.5407, -1.5476, 0.6455, -1.1382, 0.464, + -0.4986, 0.1289, 2.7631, 0.1405, 1.1191, 2.1134, -0.9754, 0.1757, -0.1319, -0.2735, 0.3355, -0.6008, + -1.1164, 0.2577, -0.7226, -0.9244, 1.8737, 0.6052, 1.1904, 1.2195, -0.047, -1.0914, 1.0223, 0.3152, + 1.7528, -0.765, 1.8299, -0.2784, -0.2719, 0.1885, 2.1432, 0.8527, 0.0965, -0.0625, 0.8269, 1.0122, + -1.4482, -0.0644, 0.3215, 0.5908, -1.4197, 0.2113, 0.0306, 0.3604, 0.3166, -0.8975, -0.6393, -1.2944, + -0.0243, -0.2354, -0.7087, 1.1566, 0.4296, 0.5599, -0.7776, 0.3339, 0.1759, 2.1108, 1.0702, 0.8279, + -0.2969, 0.712, -0.2068, -0.1548, 0.1553, 0.6207, -0.169, -0.5816, 1.2632, 0.0695, 1.1862, -1.1874, + -0.7468, -0.932, -0.8579, -0.9647, -0.0991, 0.0195, 1.1213, -1.4873, -0.2043, -1.0466, -1.5772, -0.0489, + 0.343, 0.1264, 0.1519, -1.3639, -1.6593, 1.8127, -1.4459, -0.2158, -0.9792, -1.4392, 0.6508, 0.8964, + 0.5717, -0.239, 0.6983, -1.3416, 0.2715, -0.2852, 0.6051, 0.2167, -0.2181, -1.6306, 1.4788, 0.2754, + -0.0261, -0.4618, -0.5646, -1.0389, 0.5819, 1.3697, 0.0002, 1.5333, -1.0556, -0.1254, 0.1527, -0.5996, + -1.0962, 1.6327, 1.3951, 0.8784, 0.3389, 1.2907, 0.3124, 0.7299, 1.422, 0.3375, 0.0438, 1.8698, -0.2635, + -2.0799, -0.6313, 0.409, -1.1458, 0.0784, -1.8848, -1.6165, 0.6179, 0.9905, -0.0729, 0.5054, -0.6681, + -1.4382, 1.7547, -0.9605, -0.4558, -1.6105, 0.2979, 1.1537, -1.5604, 1.2779, -1.2514, 0.6056, 0.5763, + -3.3558, 0.2836, 0.6909, -0.7631, 2.4451, -0.35, 1.3289, -0.6494, 0.3478, 1.0038, -0.2937, 0.9238, + -1.2185, 0.4138, 0.5033, 0.9174, 1.8131, 1.4436, -0.4207, 0.022, -0.6807, -1.3306, 1.5646, 0.3338, 0.7105, + 0.4683, -0.6179, 0.0818, -0.0488, -0.981, -1.3632, 0.0929, -1.7926, -0.2921, -0.4792, 0.6756, -0.3413, + -0.2242, -0.2111, 0.6282, 0.1667, -1.4055, 1.5895, 1.0838, -0.9077, -0.806, 0.7967, -2.9351, 2.4179, + -0.4026, 0.6451, 1.6845, -0.0901, 0.6106, 2.3603, 1.3908, -0.7917, -0.6734, -0.1213, -1.1116, -0.7401, + -0.7879, 0.0606, -2.3337, -1.2603, -1.7245, -0.3533, -0.9421, -0.1776, 0.3992, -1.7142, -0.5319, -0.8848, + 0.6513, 1.0002, -1.4699, -1.4254, 0.7013, 0.2414, 0.2551, -0.7457, 0.3133, -1.0941, -0.3682, -0.0163, + -0.0645, -0.8101, 0.1415, 0.0551, 0.5873, -0.5887, -1.4733, -0.8565, 0.74, -0.5033, 0.0553, 0.9265, + -0.8652, -0.0288, -0.2209, 0.061, 0.6776, 0.4361, -0.8052, 0.3955, 0.8988, 0.8238, 0.2262, 1.2912, 0.6488, + 1.2114, 1.3569, 0.2983, 0.4718, -1.1936, 0.7928, -0.8665, 0.9468, 1.1629, 0.0616, -1.3136, -0.2764, + 0.0277, -0.1126, 0.2342, -0.5866, -1.8219, 1.1079, 0.5795, -1.4249 + ], + "dims": [2, 8, 24], + "type": "float32" + }, + { + "data": [0], + "dims": [], + "type": "int64" + }, + { + "data": [ + 1.0, 1.0, 1.0, 0.5403, 0.9989, 1.0, -0.4161, 0.9957, 1.0, -0.99, 0.9903, 1.0, -0.6536, 0.9828, 1.0, + 0.2837, 0.9732, 0.9999, 0.9602, 0.9615, 0.9999, 0.7539, 0.9477, 0.9999, -0.1455, 0.9318, 0.9999, -0.9111, + 0.914, 0.9998, -0.8391, 0.8942, 0.9998, 0.0044, 0.8725, 0.9997, 0.8439, 0.8488, 0.9997, 0.9074, 0.8234, + 0.9996, 0.1367, 0.7962, 0.9995, -0.7597, 0.7673, 0.9995 + ], + "dims": [16, 3], + "type": "float32" + }, + { + "data": [ + 0.0, 0.0, 0.0, 0.8415, 0.0464, 0.0022, 0.9093, 0.0927, 0.0043, 0.1411, 0.1388, 0.0065, -0.7568, 0.1846, + 0.0086, -0.9589, 0.23, 
0.0108, -0.2794, 0.2749, 0.0129, 0.657, 0.3192, 0.0151, 0.9894, 0.3629, 0.0172, + 0.4121, 0.4057, 0.0194, -0.544, 0.4477, 0.0215, -1.0, 0.4887, 0.0237, -0.5366, 0.5286, 0.0259, 0.4202, + 0.5675, 0.028, 0.9906, 0.605, 0.0302, 0.6503, 0.6413, 0.0323 + ], + "dims": [16, 3], + "type": "float32" + } + ], + "outputs": [ + { + "data": [ + -1.0408, 0.9166, -1.3042, -1.1097, -1.2188, 1.1676, -1.019, 0.3157, -1.6036, 1.8493, 0.0447, 1.5853, + 0.1036, -0.3514, 0.2421, 0.6463, 0.873, -0.9276, 1.0311, -1.9557, -0.1482, 1.7376, 2.2039, -0.6589, + -0.8618, -0.0922, -0.9073, -0.7032, -0.5762, -0.2371, 0.6923, 1.1571, 0.7572, -1.1471, -0.5302, -0.4391, + 0.5516, 1.0461, -0.4812, -0.1443, -0.4862, -0.6423, 0.674, -0.4614, 0.5475, 1.1495, 0.2389, 0.8582, + -0.0259, -0.6099, -0.223, 1.0963, -1.5704, -0.4595, 0.9507, 0.6696, -0.7721, -1.7415, 1.2087, -0.6387, + -1.1052, -0.5243, -0.04, -0.4671, 0.4909, -0.1931, -0.1937, -0.0447, -0.3171, 2.6839, -0.0076, 1.5185, + 0.8465, 0.3737, 0.0242, -0.0703, 1.1279, 0.8862, 1.2275, -0.1786, -0.8767, -1.8072, -0.263, 0.9387, + -0.8021, 0.7813, 0.5001, -1.4202, -0.385, 0.9263, -0.0443, -0.2323, 0.548, 1.5696, 0.6193, -1.1346, + 1.7878, -0.516, 0.1192, -2.1572, 0.046, 1.1202, -1.4812, -0.9082, 0.1728, -1.5132, -0.4489, 0.337, + -0.1541, -0.9266, 0.2416, 0.927, -1.1146, 1.8758, -0.4312, 1.3714, 1.2106, -0.4272, -0.8529, 1.0328, + 1.8441, 1.7698, -0.762, 0.2168, 0.1322, -0.2802, 0.146, 2.1002, 0.8437, -0.1534, 0.4321, 0.836, 0.5955, + -1.5452, -0.0491, -0.8794, 0.2418, -1.4203, 0.3635, 0.2362, 0.3672, -0.1128, -0.8664, -0.6354, -1.4409, + -0.3413, -0.2409, -0.3188, 1.1054, 0.4265, 0.5867, -1.3279, 0.3201, 0.0125, 1.8157, 1.0745, 0.7372, + -0.2429, 0.71, -0.4299, -0.2304, 0.1645, 0.9489, -0.1816, -0.5968, 1.0394, 0.0204, 1.1786, -0.3315, + -0.3997, -0.9304, -1.4268, -1.1526, -0.1132, 0.149, 1.3967, -1.4634, -0.1412, -0.6339, -1.5995, -0.1366, + 0.7604, 0.1514, 0.0824, -1.183, -1.6572, 2.0099, -0.9108, -0.2256, 0.4527, -1.8254, 0.6475, 0.8964, + 0.5717, -0.239, 0.6983, -1.3416, 0.2715, -0.2852, 0.6051, 0.2167, -0.2181, -1.6306, 1.4788, 0.2754, + -0.0261, -0.4618, -0.5646, -1.0389, 0.5819, 1.3697, 0.0002, 1.5333, -1.0556, -0.1254, 0.1527, -1.4979, + -1.1358, 1.632, 0.2493, 0.8266, 0.3424, -0.4992, 0.2964, 0.7298, 1.8544, 0.3516, 0.0454, 1.5415, -0.2822, + -2.0774, 1.2323, 0.3963, -1.1503, -0.4775, -1.9287, -1.6164, 0.3998, 0.902, -0.0764, -1.8059, -0.5762, + -1.4362, -0.2706, -1.0183, -0.462, 2.0891, 0.1782, 1.1591, -0.8151, 1.3, -1.2464, -0.5099, 0.5098, + -3.3525, 0.4326, 0.7414, -0.7775, -0.4271, -0.3807, 1.3245, 2.4936, 0.3139, 1.0095, 0.2323, 0.845, + -1.2244, -0.4511, 0.6266, 0.9095, -1.7981, 1.5241, -0.4121, 0.2341, -0.4737, -1.3333, -1.615, 0.4164, + 0.71, -0.2429, -0.5656, 0.0863, 0.0352, -0.7227, -1.3613, -0.0988, -1.9114, -0.3009, 0.1435, 0.7029, + -0.3467, 0.5092, -0.0828, 0.6253, 0.7113, -1.2138, 1.5964, -0.8346, -1.1515, -0.7923, -0.8254, -3.0038, + 2.4033, -0.3398, 0.0922, 1.7053, 1.1114, 0.7462, 2.366, -0.8409, -0.6654, -0.653, -0.7899, -1.0957, + -0.7149, -0.1072, -0.1967, -2.3416, -1.2609, -1.6375, -0.3576, 0.9413, -0.5694, 0.3954, 0.1383, -0.7477, + -0.8689, 1.8286, 0.851, -1.4793, -0.1597, 0.8541, 0.238, 1.4392, -0.5644, 0.3158, -1.0686, -0.1313, + -0.0181, 0.2438, -0.8801, 0.1413, -0.3587, 0.8002, -0.5982, -1.4301, -0.662, 0.7324, -0.725, 0.061, + 0.9293, -0.6902, -0.0125, -0.2089, -0.1664, 0.5428, 0.4245, -0.7901, 0.5665, 0.9044, 0.1948, -0.1723, + 1.2705, 1.0303, 1.2202, 1.3762, -0.2959, 0.7237, -1.2077, 0.7937, -0.6705, 0.9287, 1.0583, 0.0496, + -1.3118, 
0.5556, 0.0459, -0.1324, -0.5513, -0.7409, -1.8002, 0.9892, 0.3619, -1.4522 + ], + "dims": [2, 8, 24], + "type": "float32" + } + ] + }, + { + "name": "T[2,4,8,6] T[1] T[16,3] T[16,3]", + "inputs": [ + { + "data": [ + -1.0408, 0.9166, -1.3042, -1.1097, -1.2188, 1.1676, -1.0574, -0.1188, -0.9078, 0.3452, -0.5713, -0.2351, + 1.0076, -0.7529, -0.225, -0.4327, -1.5071, -0.4586, -0.848, 0.5266, 0.0299, -0.0498, 1.0651, 0.886, 0.464, + -0.4986, 0.1289, 2.7631, 0.1405, 1.1191, 0.3152, 1.7528, -0.765, 1.8299, -0.2784, -0.2719, -1.2944, + -0.0243, -0.2354, -0.7087, 1.1566, 0.4296, -1.1874, -0.7468, -0.932, -0.8579, -0.9647, -0.0991, -1.019, + 0.3157, -1.6036, 1.8493, 0.0447, 1.5853, -0.5912, 1.1312, 0.7562, -1.2023, -0.5833, -0.4407, -1.9791, + 0.7787, -0.7749, -0.1398, 1.1414, -0.6354, -1.4702, -0.2134, -0.8707, 1.6159, -0.2356, 0.9444, 2.1134, + -0.9754, 0.1757, -0.1319, -0.2735, 0.3355, 0.1885, 2.1432, 0.8527, 0.0965, -0.0625, 0.8269, 0.5599, + -0.7776, 0.3339, 0.1759, 2.1108, 1.0702, 0.0195, 1.1213, -1.4873, -0.2043, -1.0466, -1.5772, 0.1036, + -0.3514, 0.2421, 0.6463, 0.873, -0.9276, 0.1766, 1.0224, -0.4826, -0.5421, -0.5342, -0.6413, 0.0352, + -0.4765, -0.0409, 1.1993, 0.5374, -0.193, 0.5937, 0.7203, 0.5061, 1.5192, -0.4897, 0.9231, -0.6008, + -1.1164, 0.2577, -0.7226, -0.9244, 1.8737, 1.0122, -1.4482, -0.0644, 0.3215, 0.5908, -1.4197, 0.8279, + -0.2969, 0.712, -0.2068, -0.1548, 0.1553, -0.0489, 0.343, 0.1264, 0.1519, -1.3639, -1.6593, 1.0311, + -1.9557, -0.1482, 1.7376, 2.2039, -0.6589, 1.3314, -0.4498, 0.5493, 0.0539, 0.2601, 0.857, 2.5211, + -0.0452, -0.3105, -0.9407, -0.0034, 1.5199, 0.2654, -0.1441, 0.5407, -1.5476, 0.6455, -1.1382, 0.6052, + 1.1904, 1.2195, -0.047, -1.0914, 1.0223, 0.2113, 0.0306, 0.3604, 0.3166, -0.8975, -0.6393, 0.6207, -0.169, + -0.5816, 1.2632, 0.0695, 1.1862, 1.8127, -1.4459, -0.2158, -0.9792, -1.4392, 0.6508, 0.8964, 0.5717, + -0.239, 0.6983, -1.3416, 0.2715, -0.5996, -1.0962, 1.6327, 1.3951, 0.8784, 0.3389, 0.5054, -0.6681, + -1.4382, 1.7547, -0.9605, -0.4558, -0.2937, 0.9238, -1.2185, 0.4138, 0.5033, 0.9174, -0.4792, 0.6756, + -0.3413, -0.2242, -0.2111, 0.6282, -0.1213, -1.1116, -0.7401, -0.7879, 0.0606, -2.3337, -1.0941, -0.3682, + -0.0163, -0.0645, -0.8101, 0.1415, 0.8238, 0.2262, 1.2912, 0.6488, 1.2114, 1.3569, -0.2852, 0.6051, + 0.2167, -0.2181, -1.6306, 1.4788, 1.2907, 0.3124, 0.7299, 1.422, 0.3375, 0.0438, -1.6105, 0.2979, 1.1537, + -1.5604, 1.2779, -1.2514, 1.8131, 1.4436, -0.4207, 0.022, -0.6807, -1.3306, 0.1667, -1.4055, 1.5895, + 1.0838, -0.9077, -0.806, -1.2603, -1.7245, -0.3533, -0.9421, -0.1776, 0.3992, 0.0551, 0.5873, -0.5887, + -1.4733, -0.8565, 0.74, 0.2983, 0.4718, -1.1936, 0.7928, -0.8665, 0.9468, 0.2754, -0.0261, -0.4618, + -0.5646, -1.0389, 0.5819, 1.8698, -0.2635, -2.0799, -0.6313, 0.409, -1.1458, 0.6056, 0.5763, -3.3558, + 0.2836, 0.6909, -0.7631, 1.5646, 0.3338, 0.7105, 0.4683, -0.6179, 0.0818, 0.7967, -2.9351, 2.4179, + -0.4026, 0.6451, 1.6845, -1.7142, -0.5319, -0.8848, 0.6513, 1.0002, -1.4699, -0.5033, 0.0553, 0.9265, + -0.8652, -0.0288, -0.2209, 1.1629, 0.0616, -1.3136, -0.2764, 0.0277, -0.1126, 1.3697, 0.0002, 1.5333, + -1.0556, -0.1254, 0.1527, 0.0784, -1.8848, -1.6165, 0.6179, 0.9905, -0.0729, 2.4451, -0.35, 1.3289, + -0.6494, 0.3478, 1.0038, -0.0488, -0.981, -1.3632, 0.0929, -1.7926, -0.2921, -0.0901, 0.6106, 2.3603, + 1.3908, -0.7917, -0.6734, -1.4254, 0.7013, 0.2414, 0.2551, -0.7457, 0.3133, 0.061, 0.6776, 0.4361, + -0.8052, 0.3955, 0.8988, 0.2342, -0.5866, -1.8219, 1.1079, 0.5795, -1.4249 + ], + "dims": [2, 4, 8, 6], + "type": 
"float32" + }, + { + "data": [0], + "dims": [1], + "type": "int64" + }, + { + "data": [ + 1.0, 1.0, 1.0, 0.5403, 0.9989, 1.0, -0.4161, 0.9957, 1.0, -0.99, 0.9903, 1.0, -0.6536, 0.9828, 1.0, + 0.2837, 0.9732, 0.9999, 0.9602, 0.9615, 0.9999, 0.7539, 0.9477, 0.9999, -0.1455, 0.9318, 0.9999, -0.9111, + 0.914, 0.9998, -0.8391, 0.8942, 0.9998, 0.0044, 0.8725, 0.9997, 0.8439, 0.8488, 0.9997, 0.9074, 0.8234, + 0.9996, 0.1367, 0.7962, 0.9995, -0.7597, 0.7673, 0.9995 + ], + "dims": [16, 3], + "type": "float32" + }, + { + "data": [ + 0.0, 0.0, 0.0, 0.8415, 0.0464, 0.0022, 0.9093, 0.0927, 0.0043, 0.1411, 0.1388, 0.0065, -0.7568, 0.1846, + 0.0086, -0.9589, 0.23, 0.0108, -0.2794, 0.2749, 0.0129, 0.657, 0.3192, 0.0151, 0.9894, 0.3629, 0.0172, + 0.4121, 0.4057, 0.0194, -0.544, 0.4477, 0.0215, -1.0, 0.4887, 0.0237, -0.5366, 0.5286, 0.0259, 0.4202, + 0.5675, 0.028, 0.9906, 0.605, 0.0302, 0.6503, 0.6413, 0.0323 + ], + "dims": [16, 3], + "type": "float32" + } + ], + "outputs": [ + { + "data": [ + -1.0408, 0.9166, -1.3042, -1.1097, -1.2188, 1.1676, -0.8618, -0.0922, -0.9073, -0.7032, -0.5762, -0.2371, + -0.0259, -0.6099, -0.223, 1.0963, -1.5704, -0.4595, 0.8465, 0.3737, 0.0242, -0.0703, 1.1279, 0.8862, + 1.7878, -0.516, 0.1192, -2.1572, 0.046, 1.1202, 1.8441, 1.7698, -0.762, 0.2168, 0.1322, -0.2802, -1.4409, + -0.3413, -0.2409, -0.3188, 1.1054, 0.4265, -0.3315, -0.3997, -0.9304, -1.4268, -1.1526, -0.1132, -1.019, + 0.3157, -1.6036, 1.8493, 0.0447, 1.5853, 0.6923, 1.1571, 0.7572, -1.1471, -0.5302, -0.4391, 0.9507, + 0.6696, -0.7721, -1.7415, 1.2087, -0.6387, 1.2275, -0.1786, -0.8767, -1.8072, -0.263, 0.9387, -1.4812, + -0.9082, 0.1728, -1.5132, -0.4489, 0.337, 0.146, 2.1002, 0.8437, -0.1534, 0.4321, 0.836, 0.5867, -1.3279, + 0.3201, 0.0125, 1.8157, 1.0745, 0.149, 1.3967, -1.4634, -0.1412, -0.6339, -1.5995, 0.1036, -0.3514, + 0.2421, 0.6463, 0.873, -0.9276, 0.5516, 1.0461, -0.4812, -0.1443, -0.4862, -0.6423, -1.1052, -0.5243, + -0.04, -0.4671, 0.4909, -0.1931, -0.8021, 0.7813, 0.5001, -1.4202, -0.385, 0.9263, -0.1541, -0.9266, + 0.2416, 0.927, -1.1146, 1.8758, 0.5955, -1.5452, -0.0491, -0.8794, 0.2418, -1.4203, 0.7372, -0.2429, 0.71, + -0.4299, -0.2304, 0.1645, -0.1366, 0.7604, 0.1514, 0.0824, -1.183, -1.6572, 1.0311, -1.9557, -0.1482, + 1.7376, 2.2039, -0.6589, 0.674, -0.4614, 0.5475, 1.1495, 0.2389, 0.8582, -0.1937, -0.0447, -0.3171, + 2.6839, -0.0076, 1.5185, -0.0443, -0.2323, 0.548, 1.5696, 0.6193, -1.1346, -0.4312, 1.3714, 1.2106, + -0.4272, -0.8529, 1.0328, 0.3635, 0.2362, 0.3672, -0.1128, -0.8664, -0.6354, 0.9489, -0.1816, -0.5968, + 1.0394, 0.0204, 1.1786, 2.0099, -0.9108, -0.2256, 0.4527, -1.8254, 0.6475, 0.8964, 0.5717, -0.239, 0.6983, + -1.3416, 0.2715, -1.4979, -1.1358, 1.632, 0.2493, 0.8266, 0.3424, -1.8059, -0.5762, -1.4362, -0.2706, + -1.0183, -0.462, 0.2323, 0.845, -1.2244, -0.4511, 0.6266, 0.9095, 0.1435, 0.7029, -0.3467, 0.5092, + -0.0828, 0.6253, -0.7899, -1.0957, -0.7149, -0.1072, -0.1967, -2.3416, -1.0686, -0.1313, -0.0181, 0.2438, + -0.8801, 0.1413, 0.1948, -0.1723, 1.2705, 1.0303, 1.2202, 1.3762, -0.2852, 0.6051, 0.2167, -0.2181, + -1.6306, 1.4788, -0.4992, 0.2964, 0.7298, 1.8544, 0.3516, 0.0454, 2.0891, 0.1782, 1.1591, -0.8151, 1.3, + -1.2464, -1.7981, 1.5241, -0.4121, 0.2341, -0.4737, -1.3333, 0.7113, -1.2138, 1.5964, -0.8346, -1.1515, + -0.7923, -1.2609, -1.6375, -0.3576, 0.9413, -0.5694, 0.3954, -0.3587, 0.8002, -0.5982, -1.4301, -0.662, + 0.7324, -0.2959, 0.7237, -1.2077, 0.7937, -0.6705, 0.9287, 0.2754, -0.0261, -0.4618, -0.5646, -1.0389, + 0.5819, 1.5415, -0.2822, -2.0774, 1.2323, 
0.3963, -1.1503, -0.5099, 0.5098, -3.3525, 0.4326, 0.7414, + -0.7775, -1.615, 0.4164, 0.71, -0.2429, -0.5656, 0.0863, -0.8254, -3.0038, 2.4033, -0.3398, 0.0922, + 1.7053, 0.1383, -0.7477, -0.8689, 1.8286, 0.851, -1.4793, -0.725, 0.061, 0.9293, -0.6902, -0.0125, + -0.2089, 1.0583, 0.0496, -1.3118, 0.5556, 0.0459, -0.1324, 1.3697, 0.0002, 1.5333, -1.0556, -0.1254, + 0.1527, -0.4775, -1.9287, -1.6164, 0.3998, 0.902, -0.0764, -0.4271, -0.3807, 1.3245, 2.4936, 0.3139, + 1.0095, 0.0352, -0.7227, -1.3613, -0.0988, -1.9114, -0.3009, 1.1114, 0.7462, 2.366, -0.8409, -0.6654, + -0.653, -0.1597, 0.8541, 0.238, 1.4392, -0.5644, 0.3158, -0.1664, 0.5428, 0.4245, -0.7901, 0.5665, 0.9044, + -0.5513, -0.7409, -1.8002, 0.9892, 0.3619, -1.4522 + ], + "dims": [2, 4, 8, 6], + "type": "float32" + } + ] + }, + { + "name": "T[1,2,18] T[1,2] T[4,3] T[4,3]", + "inputs": [ + { + "data": [ + -1.0408, 0.9166, -1.3042, -1.1097, -1.2188, 1.1676, 1.0076, -0.7529, -0.225, -0.4327, -1.5071, -0.4586, + -0.8663, -0.2656, 0.1665, 0.7911, -0.932, -0.8579, -1.0574, -0.1188, -0.9078, 0.3452, -0.5713, -0.2351, + -0.848, 0.5266, -1.2944, -0.0243, -0.2354, -0.7087, -0.9647, -0.0991, -0.2994, -0.065, -1.572, -1.3211 + ], + "dims": [1, 2, 18], + "type": "float32" + }, + { + "data": [0, 1], + "dims": [1, 2], + "type": "int64" + }, + { + "data": [1.0, 1.0, 1.0, 0.5403, 0.9989, 1.0, -0.4161, 0.9957, 1.0, -0.99, 0.9903, 1.0], + "dims": [4, 3], + "type": "float32" + }, + { + "data": [0.0, 0.0, 0.0, 0.8415, 0.0464, 0.0022, 0.9093, 0.0927, 0.0043, 0.1411, 0.1388, 0.0065], + "dims": [4, 3], + "type": "float32" + } + ], + "outputs": [ + { + "data": [ + -1.0408, 0.9166, -1.3042, -1.1097, -1.2188, 1.1676, 1.0076, -0.7529, -0.225, -0.4327, -1.5071, -0.4586, + -0.8663, -0.2656, 0.1665, 0.7911, -0.932, -0.8579, -0.8618, -0.0922, -0.9073, -0.7032, -0.5762, -0.2371, + -0.4377, 0.537, -1.2929, -0.7267, -0.2107, -0.7115, -0.4666, -0.0261, -0.2965, -0.8469, -1.5749, -1.3217 + ], + "dims": [1, 2, 18], + "type": "float32" + } + ] + }, + { + "name": "T[1,3,2,6] T[1,2] T[4,3] T[4,3]", + "inputs": [ + { + "data": [ + -1.0408, 0.9166, -1.3042, -1.1097, -1.2188, 1.1676, -1.0574, -0.1188, -0.9078, 0.3452, -0.5713, -0.2351, + 1.0076, -0.7529, -0.225, -0.4327, -1.5071, -0.4586, -0.848, 0.5266, -1.2944, -0.0243, -0.2354, -0.7087, + -0.8663, -0.2656, 0.1665, 0.7911, -0.932, -0.8579, -0.9647, -0.0991, -0.2994, -0.065, -1.572, -1.3211 + ], + "dims": [1, 3, 2, 6], + "type": "float32" + }, + { + "data": [0, 1], + "dims": [1, 2], + "type": "int64" + }, + { + "data": [1.0, 1.0, 1.0, 0.5403, 0.9989, 1.0, -0.4161, 0.9957, 1.0, -0.99, 0.9903, 1.0], + "dims": [4, 3], + "type": "float32" + }, + { + "data": [0.0, 0.0, 0.0, 0.8415, 0.0464, 0.0022, 0.9093, 0.0927, 0.0043, 0.1411, 0.1388, 0.0065], + "dims": [4, 3], + "type": "float32" + } + ], + "outputs": [ + { + "data": [ + -1.0408, 0.9166, -1.3042, -1.1097, -1.2188, 1.1676, -0.8618, -0.0922, -0.9073, -0.7032, -0.5762, -0.2371, + 1.0076, -0.7529, -0.225, -0.4327, -1.5071, -0.4586, -0.4377, 0.537, -1.2929, -0.7267, -0.2107, -0.7115, + -0.8663, -0.2656, 0.1665, 0.7911, -0.932, -0.8579, -0.4666, -0.0261, -0.2965, -0.8469, -1.5749, -1.3217 + ], + "dims": [1, 3, 2, 6], + "type": "float32" + } + ] + } + ] + }, + { + "name": "RotaryEmbedding with interleaved pattern", + "operator": "RotaryEmbedding", + "opset": { "domain": "com.microsoft", "version": 1 }, + "attributes": [{ "name": "interleaved", "data": 1, "type": "int" }], + "cases": [ + { + "name": "T[1,3,8] T[1] T[8,2] T[8,2]", + "inputs": [ + { + "data": [ + -1.0408, 0.9166, 
-1.3042, -1.1097, -0.132, -0.2751, -0.235, 0.0937, -1.2188, 1.1676, -1.0574, -0.1188, + -0.7396, -1.2425, -0.1752, 0.699, -0.811, 0.6737, -1.1233, -0.0919, -0.6861, 0.7202, 0.1963, 0.6142 + ], + "dims": [1, 3, 8], + "type": "float32" + }, + { + "data": [0], + "dims": [1], + "type": "int64" + }, + { + "data": [ + 1.0, 1.0, 0.5403, 0.9999, -0.4161, 0.9998, -0.99, 0.9996, -0.6536, 0.9992, 0.2837, 0.9988, 0.9602, 0.9982, + 0.7539, 0.9976 + ], + "dims": [8, 2], + "type": "float32" + }, + { + "data": [ + 0.0, 0.0, 0.8415, 0.01, 0.9093, 0.02, 0.1411, 0.03, -0.7568, 0.04, -0.9589, 0.05, -0.2794, 0.06, 0.657, + 0.0699 + ], + "dims": [8, 2], + "type": "float32" + } + ], + "outputs": [ + { + "data": [ + -1.0408, 0.9166, -1.3042, -1.1097, -0.132, -0.2751, -0.235, 0.0937, -1.6411, -0.3948, -1.0561, -0.1294, + 0.646, -1.2937, -0.1822, 0.6972, -0.2751, -1.0178, -1.1212, -0.1143, -0.3694, -0.9235, 0.184, 0.618 + ], + "dims": [1, 3, 8], + "type": "float32" + } + ] + }, + { + "name": "T[1,3,8] Scalar T[8,2] T[8,2]", + "inputs": [ + { + "data": [ + -1.0408, 0.9166, -1.3042, -1.1097, -0.132, -0.2751, -0.235, 0.0937, -1.2188, 1.1676, -1.0574, -0.1188, + -0.7396, -1.2425, -0.1752, 0.699, -0.811, 0.6737, -1.1233, -0.0919, -0.6861, 0.7202, 0.1963, 0.6142 + ], + "dims": [1, 3, 8], + "type": "float32" + }, + { + "data": [0], + "dims": [], + "type": "int64" + }, + { + "data": [ + 1.0, 1.0, 0.5403, 0.9999, -0.4161, 0.9998, -0.99, 0.9996, -0.6536, 0.9992, 0.2837, 0.9988, 0.9602, 0.9982, + 0.7539, 0.9976 + ], + "dims": [8, 2], + "type": "float32" + }, + { + "data": [ + 0.0, 0.0, 0.8415, 0.01, 0.9093, 0.02, 0.1411, 0.03, -0.7568, 0.04, -0.9589, 0.05, -0.2794, 0.06, 0.657, + 0.0699 + ], + "dims": [8, 2], + "type": "float32" + } + ], + "outputs": [ + { + "data": [ + -1.0408, 0.9166, -1.3042, -1.1097, -0.132, -0.2751, -0.235, 0.0937, -1.6411, -0.3948, -1.0561, -0.1294, + 0.646, -1.2937, -0.1822, 0.6972, -0.2751, -1.0178, -1.1212, -0.1143, -0.3694, -0.9235, 0.184, 0.618 + ], + "dims": [1, 3, 8], + "type": "float32" + } + ] + }, + { + "name": "T[1,2,3,4] T[1] T[8,2] T[8,2]", + "inputs": [ + { + "data": [ + -1.0408, 0.9166, -1.3042, -1.1097, -1.2188, 1.1676, -1.0574, -0.1188, -0.811, 0.6737, -1.1233, -0.0919, + -0.132, -0.2751, -0.235, 0.0937, -0.7396, -1.2425, -0.1752, 0.699, -0.6861, 0.7202, 0.1963, 0.6142 + ], + "dims": [1, 2, 3, 4], + "type": "float32" + }, + { + "data": [0], + "dims": [1], + "type": "int64" + }, + { + "data": [ + 1.0, 1.0, 0.5403, 0.9999, -0.4161, 0.9998, -0.99, 0.9996, -0.6536, 0.9992, 0.2837, 0.9988, 0.9602, 0.9982, + 0.7539, 0.9976 + ], + "dims": [8, 2], + "type": "float32" + }, + { + "data": [ + 0.0, 0.0, 0.8415, 0.01, 0.9093, 0.02, 0.1411, 0.03, -0.7568, 0.04, -0.9589, 0.05, -0.2794, 0.06, 0.657, + 0.0699 + ], + "dims": [8, 2], + "type": "float32" + } + ], + "outputs": [ + { + "data": [ + -1.0408, 0.9166, -1.3042, -1.1097, -1.6411, -0.3948, -1.0561, -0.1294, -0.2751, -1.0178, -1.1212, -0.1143, + -0.132, -0.2751, -0.235, 0.0937, 0.646, -1.2937, -0.1822, 0.6972, -0.3694, -0.9235, 0.184, 0.618 + ], + "dims": [1, 2, 3, 4], + "type": "float32" + } + ] + }, + { + "name": "T[2,8,24] T[1] T[16,3] T[16,3]", + "inputs": [ + { + "data": [ + -1.0408, 0.9166, -1.3042, -1.1097, -1.2188, 1.1676, -1.019, 0.3157, -1.6036, 1.8493, 0.0447, 1.5853, + 0.1036, -0.3514, 0.2421, 0.6463, 0.873, -0.9276, 1.0311, -1.9557, -0.1482, 1.7376, 2.2039, -0.6589, + -1.0574, -0.1188, -0.9078, 0.3452, -0.5713, -0.2351, -0.5912, 1.1312, 0.7562, -1.2023, -0.5833, -0.4407, + 0.1766, 1.0224, -0.4826, -0.5421, -0.5342, -0.6413, 1.3314, 
-0.4498, 0.5493, 0.0539, 0.2601, 0.857, + 1.0076, -0.7529, -0.225, -0.4327, -1.5071, -0.4586, -1.9791, 0.7787, -0.7749, -0.1398, 1.1414, -0.6354, + 0.0352, -0.4765, -0.0409, 1.1993, 0.5374, -0.193, 2.5211, -0.0452, -0.3105, -0.9407, -0.0034, 1.5199, + -0.848, 0.5266, 0.0299, -0.0498, 1.0651, 0.886, -1.4702, -0.2134, -0.8707, 1.6159, -0.2356, 0.9444, + 0.5937, 0.7203, 0.5061, 1.5192, -0.4897, 0.9231, 0.2654, -0.1441, 0.5407, -1.5476, 0.6455, -1.1382, 0.464, + -0.4986, 0.1289, 2.7631, 0.1405, 1.1191, 2.1134, -0.9754, 0.1757, -0.1319, -0.2735, 0.3355, -0.6008, + -1.1164, 0.2577, -0.7226, -0.9244, 1.8737, 0.6052, 1.1904, 1.2195, -0.047, -1.0914, 1.0223, 0.3152, + 1.7528, -0.765, 1.8299, -0.2784, -0.2719, 0.1885, 2.1432, 0.8527, 0.0965, -0.0625, 0.8269, 1.0122, + -1.4482, -0.0644, 0.3215, 0.5908, -1.4197, 0.2113, 0.0306, 0.3604, 0.3166, -0.8975, -0.6393, -1.2944, + -0.0243, -0.2354, -0.7087, 1.1566, 0.4296, 0.5599, -0.7776, 0.3339, 0.1759, 2.1108, 1.0702, 0.8279, + -0.2969, 0.712, -0.2068, -0.1548, 0.1553, 0.6207, -0.169, -0.5816, 1.2632, 0.0695, 1.1862, -1.1874, + -0.7468, -0.932, -0.8579, -0.9647, -0.0991, 0.0195, 1.1213, -1.4873, -0.2043, -1.0466, -1.5772, -0.0489, + 0.343, 0.1264, 0.1519, -1.3639, -1.6593, 1.8127, -1.4459, -0.2158, -0.9792, -1.4392, 0.6508, 0.8964, + 0.5717, -0.239, 0.6983, -1.3416, 0.2715, -0.2852, 0.6051, 0.2167, -0.2181, -1.6306, 1.4788, 0.2754, + -0.0261, -0.4618, -0.5646, -1.0389, 0.5819, 1.3697, 0.0002, 1.5333, -1.0556, -0.1254, 0.1527, -0.5996, + -1.0962, 1.6327, 1.3951, 0.8784, 0.3389, 1.2907, 0.3124, 0.7299, 1.422, 0.3375, 0.0438, 1.8698, -0.2635, + -2.0799, -0.6313, 0.409, -1.1458, 0.0784, -1.8848, -1.6165, 0.6179, 0.9905, -0.0729, 0.5054, -0.6681, + -1.4382, 1.7547, -0.9605, -0.4558, -1.6105, 0.2979, 1.1537, -1.5604, 1.2779, -1.2514, 0.6056, 0.5763, + -3.3558, 0.2836, 0.6909, -0.7631, 2.4451, -0.35, 1.3289, -0.6494, 0.3478, 1.0038, -0.2937, 0.9238, + -1.2185, 0.4138, 0.5033, 0.9174, 1.8131, 1.4436, -0.4207, 0.022, -0.6807, -1.3306, 1.5646, 0.3338, 0.7105, + 0.4683, -0.6179, 0.0818, -0.0488, -0.981, -1.3632, 0.0929, -1.7926, -0.2921, -0.4792, 0.6756, -0.3413, + -0.2242, -0.2111, 0.6282, 0.1667, -1.4055, 1.5895, 1.0838, -0.9077, -0.806, 0.7967, -2.9351, 2.4179, + -0.4026, 0.6451, 1.6845, -0.0901, 0.6106, 2.3603, 1.3908, -0.7917, -0.6734, -0.1213, -1.1116, -0.7401, + -0.7879, 0.0606, -2.3337, -1.2603, -1.7245, -0.3533, -0.9421, -0.1776, 0.3992, -1.7142, -0.5319, -0.8848, + 0.6513, 1.0002, -1.4699, -1.4254, 0.7013, 0.2414, 0.2551, -0.7457, 0.3133, -1.0941, -0.3682, -0.0163, + -0.0645, -0.8101, 0.1415, 0.0551, 0.5873, -0.5887, -1.4733, -0.8565, 0.74, -0.5033, 0.0553, 0.9265, + -0.8652, -0.0288, -0.2209, 0.061, 0.6776, 0.4361, -0.8052, 0.3955, 0.8988, 0.8238, 0.2262, 1.2912, 0.6488, + 1.2114, 1.3569, 0.2983, 0.4718, -1.1936, 0.7928, -0.8665, 0.9468, 1.1629, 0.0616, -1.3136, -0.2764, + 0.0277, -0.1126, 0.2342, -0.5866, -1.8219, 1.1079, 0.5795, -1.4249 + ], + "dims": [2, 8, 24], + "type": "float32" + }, + { + "data": [0], + "dims": [1], + "type": "int64" + }, + { + "data": [ + 1.0, 1.0, 1.0, 0.5403, 0.9989, 1.0, -0.4161, 0.9957, 1.0, -0.99, 0.9903, 1.0, -0.6536, 0.9828, 1.0, + 0.2837, 0.9732, 0.9999, 0.9602, 0.9615, 0.9999, 0.7539, 0.9477, 0.9999, -0.1455, 0.9318, 0.9999, -0.9111, + 0.914, 0.9998, -0.8391, 0.8942, 0.9998, 0.0044, 0.8725, 0.9997, 0.8439, 0.8488, 0.9997, 0.9074, 0.8234, + 0.9996, 0.1367, 0.7962, 0.9995, -0.7597, 0.7673, 0.9995 + ], + "dims": [16, 3], + "type": "float32" + }, + { + "data": [ + 0.0, 0.0, 0.0, 0.8415, 0.0464, 0.0022, 0.9093, 0.0927, 0.0043, 
0.1411, 0.1388, 0.0065, -0.7568, 0.1846, + 0.0086, -0.9589, 0.23, 0.0108, -0.2794, 0.2749, 0.0129, 0.657, 0.3192, 0.0151, 0.9894, 0.3629, 0.0172, + 0.4121, 0.4057, 0.0194, -0.544, 0.4477, 0.0215, -1.0, 0.4887, 0.0237, -0.5366, 0.5286, 0.0259, 0.4202, + 0.5675, 0.028, 0.9906, 0.605, 0.0302, 0.6503, 0.6413, 0.0323 + ], + "dims": [16, 3], + "type": "float32" + } + ], + "outputs": [ + { + "data": [ + -1.0408, 0.9166, -1.3042, -1.1097, -1.2188, 1.1676, -1.019, 0.3157, -1.6036, 1.8493, 0.0447, 1.5853, + 0.1036, -0.3514, 0.2421, 0.6463, 0.873, -0.9276, 1.0311, -1.9557, -0.1482, 1.7376, 2.2039, -0.6589, + -0.4713, -0.954, -0.9229, 0.3027, -0.5708, -0.2363, -1.2713, 0.1137, 0.8112, -1.1659, -0.5824, -0.4419, + -0.7649, 0.7011, -0.4569, -0.5639, -0.5328, -0.6424, 1.0979, 0.8773, 0.5462, 0.0793, 0.2582, 0.8576, + 0.2653, 1.2295, -0.1839, -0.4517, -1.5052, -0.4651, 0.1155, -2.1237, -0.7586, -0.211, 1.1441, -0.6304, + 0.4186, 0.2303, -0.1519, 1.1903, 0.5382, -0.1906, -1.008, 2.3112, -0.222, -0.9655, -0.0099, 1.5198, + 0.7652, -0.641, 0.0365, -0.0452, 1.0593, 0.8929, 1.4856, 0.0038, -1.0865, 1.4794, -0.2417, 0.9428, + -0.6894, -0.6293, 0.2904, 1.5747, -0.4956, 0.9199, -0.2424, 0.1801, 0.7503, -1.4576, 0.6529, -1.134, + -0.6807, -0.0252, -0.3834, 2.7394, 0.1308, 1.1203, -2.1196, -0.9618, 0.197, -0.0972, -0.2764, 0.3332, + -0.4522, 1.1844, 0.3867, -0.6626, -0.9405, 1.8656, 0.5053, -1.2361, 1.2072, 0.1789, -1.1002, 1.0129, + 1.7702, 0.1949, -1.1653, 1.6049, -0.2755, -0.2749, 2.1087, 0.4272, 0.8076, 0.29, -0.0714, 0.8261, -1.1016, + -1.3814, -0.1366, 0.2981, 0.606, -1.4132, 0.0893, -0.1939, 0.2779, 0.391, -0.8906, -0.6489, -1.2496, + 0.3383, -0.0315, -0.7461, 1.151, 0.4445, 0.3203, -0.9031, 0.2727, 0.2609, 2.0968, 1.0974, 0.712, -0.5164, + 0.7415, -0.0031, -0.1568, 0.1533, 0.5487, -0.3357, -0.9064, 1.0546, 0.0542, 1.187, -0.4045, -1.3431, + -0.6094, -1.1105, -0.9631, -0.1137, -0.7219, 0.8582, -1.3443, -0.6684, -1.0227, -1.5929, -0.2622, 0.2264, + 0.0713, 0.1843, -1.3387, -1.6797, 2.3165, 0.1009, 0.1081, -0.9969, -1.4488, 0.6291, 0.8964, 0.5717, + -0.239, 0.6983, -1.3416, 0.2715, -0.2852, 0.6051, 0.2167, -0.2181, -1.6306, 1.4788, 0.2754, -0.0261, + -0.4618, -0.5646, -1.0389, 0.5819, 1.3697, 0.0002, 1.5333, -1.0556, -0.1254, 0.1527, 0.5985, -1.0968, + 1.5662, 1.4693, 0.8776, 0.3408, 0.4345, 1.2549, 0.6631, 1.4543, 0.3374, 0.0445, 1.232, 1.4311, -2.0483, + -0.7272, 0.4114, -1.1449, 1.6283, -0.9524, -1.6435, 0.5422, 0.9907, -0.0708, 0.3972, 0.7376, -1.5947, + 1.6138, -0.9586, -0.46, 0.3993, -1.5884, 1.2934, -1.4467, 1.2833, -1.2459, -0.776, 0.3108, -3.3677, + -0.0287, 0.6942, -0.7601, -0.6993, 2.369, 1.3834, -0.5234, 0.3435, 1.0053, 0.1604, -0.956, -1.2641, + 0.2406, 0.4973, 0.9206, -1.9987, -1.1733, -0.4197, -0.0366, -0.672, -1.335, -1.596, -0.1097, 0.6386, + 0.5624, -0.6184, 0.0778, 0.1867, 0.9643, -1.3629, -0.0972, -1.7907, -0.3037, 0.8245, -0.0789, -0.294, + -0.2833, -0.2165, 0.6264, -1.1726, 0.7926, 1.3621, 1.3586, -0.9007, -0.8138, -2.7421, 1.3155, 2.4507, + 0.0507, 0.6305, 1.69, 0.521, -0.3309, 2.063, 1.8026, -0.7859, -0.6802, -1.1003, -0.199, -0.5391, -0.937, + 0.0857, -2.333, -2.0112, 0.7193, -0.1272, -0.9981, -0.1818, 0.3973, -0.9963, 1.4929, -1.0109, 0.4304, + 1.016, -1.459, 0.2682, 1.5658, 0.1762, 0.3038, -0.7491, 0.3052, -1.1534, -0.0478, 0.0021, -0.0665, + -0.8118, 0.131, 0.2171, 0.5485, -0.161, -1.5784, -0.866, 0.7289, -0.4678, 0.1937, 1.1287, -0.5772, + -0.0259, -0.2212, 0.2479, 0.6336, 0.6407, -0.6543, 0.3838, 0.9039, 0.4724, 0.7117, 1.0165, 1.027, 1.1908, + 1.375, -0.085, 0.5517, -1.3842, 
0.3703, -0.8806, 0.9336, 0.8362, 0.8105, -1.1566, -0.6813, 0.0294, + -0.1122, 0.562, -0.2884, -2.0803, 0.4684, 0.6009, -1.416 + ], + "dims": [2, 8, 24], + "type": "float32" + } + ] + }, + { + "name": "T[2,8,24] Scalar T[16,3] T[16,3]", + "inputs": [ + { + "data": [ + -1.0408, 0.9166, -1.3042, -1.1097, -1.2188, 1.1676, -1.019, 0.3157, -1.6036, 1.8493, 0.0447, 1.5853, + 0.1036, -0.3514, 0.2421, 0.6463, 0.873, -0.9276, 1.0311, -1.9557, -0.1482, 1.7376, 2.2039, -0.6589, + -1.0574, -0.1188, -0.9078, 0.3452, -0.5713, -0.2351, -0.5912, 1.1312, 0.7562, -1.2023, -0.5833, -0.4407, + 0.1766, 1.0224, -0.4826, -0.5421, -0.5342, -0.6413, 1.3314, -0.4498, 0.5493, 0.0539, 0.2601, 0.857, + 1.0076, -0.7529, -0.225, -0.4327, -1.5071, -0.4586, -1.9791, 0.7787, -0.7749, -0.1398, 1.1414, -0.6354, + 0.0352, -0.4765, -0.0409, 1.1993, 0.5374, -0.193, 2.5211, -0.0452, -0.3105, -0.9407, -0.0034, 1.5199, + -0.848, 0.5266, 0.0299, -0.0498, 1.0651, 0.886, -1.4702, -0.2134, -0.8707, 1.6159, -0.2356, 0.9444, + 0.5937, 0.7203, 0.5061, 1.5192, -0.4897, 0.9231, 0.2654, -0.1441, 0.5407, -1.5476, 0.6455, -1.1382, 0.464, + -0.4986, 0.1289, 2.7631, 0.1405, 1.1191, 2.1134, -0.9754, 0.1757, -0.1319, -0.2735, 0.3355, -0.6008, + -1.1164, 0.2577, -0.7226, -0.9244, 1.8737, 0.6052, 1.1904, 1.2195, -0.047, -1.0914, 1.0223, 0.3152, + 1.7528, -0.765, 1.8299, -0.2784, -0.2719, 0.1885, 2.1432, 0.8527, 0.0965, -0.0625, 0.8269, 1.0122, + -1.4482, -0.0644, 0.3215, 0.5908, -1.4197, 0.2113, 0.0306, 0.3604, 0.3166, -0.8975, -0.6393, -1.2944, + -0.0243, -0.2354, -0.7087, 1.1566, 0.4296, 0.5599, -0.7776, 0.3339, 0.1759, 2.1108, 1.0702, 0.8279, + -0.2969, 0.712, -0.2068, -0.1548, 0.1553, 0.6207, -0.169, -0.5816, 1.2632, 0.0695, 1.1862, -1.1874, + -0.7468, -0.932, -0.8579, -0.9647, -0.0991, 0.0195, 1.1213, -1.4873, -0.2043, -1.0466, -1.5772, -0.0489, + 0.343, 0.1264, 0.1519, -1.3639, -1.6593, 1.8127, -1.4459, -0.2158, -0.9792, -1.4392, 0.6508, 0.8964, + 0.5717, -0.239, 0.6983, -1.3416, 0.2715, -0.2852, 0.6051, 0.2167, -0.2181, -1.6306, 1.4788, 0.2754, + -0.0261, -0.4618, -0.5646, -1.0389, 0.5819, 1.3697, 0.0002, 1.5333, -1.0556, -0.1254, 0.1527, -0.5996, + -1.0962, 1.6327, 1.3951, 0.8784, 0.3389, 1.2907, 0.3124, 0.7299, 1.422, 0.3375, 0.0438, 1.8698, -0.2635, + -2.0799, -0.6313, 0.409, -1.1458, 0.0784, -1.8848, -1.6165, 0.6179, 0.9905, -0.0729, 0.5054, -0.6681, + -1.4382, 1.7547, -0.9605, -0.4558, -1.6105, 0.2979, 1.1537, -1.5604, 1.2779, -1.2514, 0.6056, 0.5763, + -3.3558, 0.2836, 0.6909, -0.7631, 2.4451, -0.35, 1.3289, -0.6494, 0.3478, 1.0038, -0.2937, 0.9238, + -1.2185, 0.4138, 0.5033, 0.9174, 1.8131, 1.4436, -0.4207, 0.022, -0.6807, -1.3306, 1.5646, 0.3338, 0.7105, + 0.4683, -0.6179, 0.0818, -0.0488, -0.981, -1.3632, 0.0929, -1.7926, -0.2921, -0.4792, 0.6756, -0.3413, + -0.2242, -0.2111, 0.6282, 0.1667, -1.4055, 1.5895, 1.0838, -0.9077, -0.806, 0.7967, -2.9351, 2.4179, + -0.4026, 0.6451, 1.6845, -0.0901, 0.6106, 2.3603, 1.3908, -0.7917, -0.6734, -0.1213, -1.1116, -0.7401, + -0.7879, 0.0606, -2.3337, -1.2603, -1.7245, -0.3533, -0.9421, -0.1776, 0.3992, -1.7142, -0.5319, -0.8848, + 0.6513, 1.0002, -1.4699, -1.4254, 0.7013, 0.2414, 0.2551, -0.7457, 0.3133, -1.0941, -0.3682, -0.0163, + -0.0645, -0.8101, 0.1415, 0.0551, 0.5873, -0.5887, -1.4733, -0.8565, 0.74, -0.5033, 0.0553, 0.9265, + -0.8652, -0.0288, -0.2209, 0.061, 0.6776, 0.4361, -0.8052, 0.3955, 0.8988, 0.8238, 0.2262, 1.2912, 0.6488, + 1.2114, 1.3569, 0.2983, 0.4718, -1.1936, 0.7928, -0.8665, 0.9468, 1.1629, 0.0616, -1.3136, -0.2764, + 0.0277, -0.1126, 0.2342, -0.5866, -1.8219, 1.1079, 
0.5795, -1.4249 + ], + "dims": [2, 8, 24], + "type": "float32" + }, + { + "data": [0], + "dims": [], + "type": "int64" + }, + { + "data": [ + 1.0, 1.0, 1.0, 0.5403, 0.9989, 1.0, -0.4161, 0.9957, 1.0, -0.99, 0.9903, 1.0, -0.6536, 0.9828, 1.0, + 0.2837, 0.9732, 0.9999, 0.9602, 0.9615, 0.9999, 0.7539, 0.9477, 0.9999, -0.1455, 0.9318, 0.9999, -0.9111, + 0.914, 0.9998, -0.8391, 0.8942, 0.9998, 0.0044, 0.8725, 0.9997, 0.8439, 0.8488, 0.9997, 0.9074, 0.8234, + 0.9996, 0.1367, 0.7962, 0.9995, -0.7597, 0.7673, 0.9995 + ], + "dims": [16, 3], + "type": "float32" + }, + { + "data": [ + 0.0, 0.0, 0.0, 0.8415, 0.0464, 0.0022, 0.9093, 0.0927, 0.0043, 0.1411, 0.1388, 0.0065, -0.7568, 0.1846, + 0.0086, -0.9589, 0.23, 0.0108, -0.2794, 0.2749, 0.0129, 0.657, 0.3192, 0.0151, 0.9894, 0.3629, 0.0172, + 0.4121, 0.4057, 0.0194, -0.544, 0.4477, 0.0215, -1.0, 0.4887, 0.0237, -0.5366, 0.5286, 0.0259, 0.4202, + 0.5675, 0.028, 0.9906, 0.605, 0.0302, 0.6503, 0.6413, 0.0323 + ], + "dims": [16, 3], + "type": "float32" + } + ], + "outputs": [ + { + "data": [ + -1.0408, 0.9166, -1.3042, -1.1097, -1.2188, 1.1676, -1.019, 0.3157, -1.6036, 1.8493, 0.0447, 1.5853, + 0.1036, -0.3514, 0.2421, 0.6463, 0.873, -0.9276, 1.0311, -1.9557, -0.1482, 1.7376, 2.2039, -0.6589, + -0.4713, -0.954, -0.9229, 0.3027, -0.5708, -0.2363, -1.2713, 0.1137, 0.8112, -1.1659, -0.5824, -0.4419, + -0.7649, 0.7011, -0.4569, -0.5639, -0.5328, -0.6424, 1.0979, 0.8773, 0.5462, 0.0793, 0.2582, 0.8576, + 0.2653, 1.2295, -0.1839, -0.4517, -1.5052, -0.4651, 0.1155, -2.1237, -0.7586, -0.211, 1.1441, -0.6304, + 0.4186, 0.2303, -0.1519, 1.1903, 0.5382, -0.1906, -1.008, 2.3112, -0.222, -0.9655, -0.0099, 1.5198, + 0.7652, -0.641, 0.0365, -0.0452, 1.0593, 0.8929, 1.4856, 0.0038, -1.0865, 1.4794, -0.2417, 0.9428, + -0.6894, -0.6293, 0.2904, 1.5747, -0.4956, 0.9199, -0.2424, 0.1801, 0.7503, -1.4576, 0.6529, -1.134, + -0.6807, -0.0252, -0.3834, 2.7394, 0.1308, 1.1203, -2.1196, -0.9618, 0.197, -0.0972, -0.2764, 0.3332, + -0.4522, 1.1844, 0.3867, -0.6626, -0.9405, 1.8656, 0.5053, -1.2361, 1.2072, 0.1789, -1.1002, 1.0129, + 1.7702, 0.1949, -1.1653, 1.6049, -0.2755, -0.2749, 2.1087, 0.4272, 0.8076, 0.29, -0.0714, 0.8261, -1.1016, + -1.3814, -0.1366, 0.2981, 0.606, -1.4132, 0.0893, -0.1939, 0.2779, 0.391, -0.8906, -0.6489, -1.2496, + 0.3383, -0.0315, -0.7461, 1.151, 0.4445, 0.3203, -0.9031, 0.2727, 0.2609, 2.0968, 1.0974, 0.712, -0.5164, + 0.7415, -0.0031, -0.1568, 0.1533, 0.5487, -0.3357, -0.9064, 1.0546, 0.0542, 1.187, -0.4045, -1.3431, + -0.6094, -1.1105, -0.9631, -0.1137, -0.7219, 0.8582, -1.3443, -0.6684, -1.0227, -1.5929, -0.2622, 0.2264, + 0.0713, 0.1843, -1.3387, -1.6797, 2.3165, 0.1009, 0.1081, -0.9969, -1.4488, 0.6291, 0.8964, 0.5717, + -0.239, 0.6983, -1.3416, 0.2715, -0.2852, 0.6051, 0.2167, -0.2181, -1.6306, 1.4788, 0.2754, -0.0261, + -0.4618, -0.5646, -1.0389, 0.5819, 1.3697, 0.0002, 1.5333, -1.0556, -0.1254, 0.1527, 0.5985, -1.0968, + 1.5662, 1.4693, 0.8776, 0.3408, 0.4345, 1.2549, 0.6631, 1.4543, 0.3374, 0.0445, 1.232, 1.4311, -2.0483, + -0.7272, 0.4114, -1.1449, 1.6283, -0.9524, -1.6435, 0.5422, 0.9907, -0.0708, 0.3972, 0.7376, -1.5947, + 1.6138, -0.9586, -0.46, 0.3993, -1.5884, 1.2934, -1.4467, 1.2833, -1.2459, -0.776, 0.3108, -3.3677, + -0.0287, 0.6942, -0.7601, -0.6993, 2.369, 1.3834, -0.5234, 0.3435, 1.0053, 0.1604, -0.956, -1.2641, + 0.2406, 0.4973, 0.9206, -1.9987, -1.1733, -0.4197, -0.0366, -0.672, -1.335, -1.596, -0.1097, 0.6386, + 0.5624, -0.6184, 0.0778, 0.1867, 0.9643, -1.3629, -0.0972, -1.7907, -0.3037, 0.8245, -0.0789, -0.294, + -0.2833, -0.2165, 
0.6264, -1.1726, 0.7926, 1.3621, 1.3586, -0.9007, -0.8138, -2.7421, 1.3155, 2.4507, + 0.0507, 0.6305, 1.69, 0.521, -0.3309, 2.063, 1.8026, -0.7859, -0.6802, -1.1003, -0.199, -0.5391, -0.937, + 0.0857, -2.333, -2.0112, 0.7193, -0.1272, -0.9981, -0.1818, 0.3973, -0.9963, 1.4929, -1.0109, 0.4304, + 1.016, -1.459, 0.2682, 1.5658, 0.1762, 0.3038, -0.7491, 0.3052, -1.1534, -0.0478, 0.0021, -0.0665, + -0.8118, 0.131, 0.2171, 0.5485, -0.161, -1.5784, -0.866, 0.7289, -0.4678, 0.1937, 1.1287, -0.5772, + -0.0259, -0.2212, 0.2479, 0.6336, 0.6407, -0.6543, 0.3838, 0.9039, 0.4724, 0.7117, 1.0165, 1.027, 1.1908, + 1.375, -0.085, 0.5517, -1.3842, 0.3703, -0.8806, 0.9336, 0.8362, 0.8105, -1.1566, -0.6813, 0.0294, + -0.1122, 0.562, -0.2884, -2.0803, 0.4684, 0.6009, -1.416 + ], + "dims": [2, 8, 24], + "type": "float32" + } + ] + }, + { + "name": "T[2,4,8,6] T[1] T[16,3] T[16,3]", + "inputs": [ + { + "data": [ + -1.0408, 0.9166, -1.3042, -1.1097, -1.2188, 1.1676, -1.0574, -0.1188, -0.9078, 0.3452, -0.5713, -0.2351, + 1.0076, -0.7529, -0.225, -0.4327, -1.5071, -0.4586, -0.848, 0.5266, 0.0299, -0.0498, 1.0651, 0.886, 0.464, + -0.4986, 0.1289, 2.7631, 0.1405, 1.1191, 0.3152, 1.7528, -0.765, 1.8299, -0.2784, -0.2719, -1.2944, + -0.0243, -0.2354, -0.7087, 1.1566, 0.4296, -1.1874, -0.7468, -0.932, -0.8579, -0.9647, -0.0991, -1.019, + 0.3157, -1.6036, 1.8493, 0.0447, 1.5853, -0.5912, 1.1312, 0.7562, -1.2023, -0.5833, -0.4407, -1.9791, + 0.7787, -0.7749, -0.1398, 1.1414, -0.6354, -1.4702, -0.2134, -0.8707, 1.6159, -0.2356, 0.9444, 2.1134, + -0.9754, 0.1757, -0.1319, -0.2735, 0.3355, 0.1885, 2.1432, 0.8527, 0.0965, -0.0625, 0.8269, 0.5599, + -0.7776, 0.3339, 0.1759, 2.1108, 1.0702, 0.0195, 1.1213, -1.4873, -0.2043, -1.0466, -1.5772, 0.1036, + -0.3514, 0.2421, 0.6463, 0.873, -0.9276, 0.1766, 1.0224, -0.4826, -0.5421, -0.5342, -0.6413, 0.0352, + -0.4765, -0.0409, 1.1993, 0.5374, -0.193, 0.5937, 0.7203, 0.5061, 1.5192, -0.4897, 0.9231, -0.6008, + -1.1164, 0.2577, -0.7226, -0.9244, 1.8737, 1.0122, -1.4482, -0.0644, 0.3215, 0.5908, -1.4197, 0.8279, + -0.2969, 0.712, -0.2068, -0.1548, 0.1553, -0.0489, 0.343, 0.1264, 0.1519, -1.3639, -1.6593, 1.0311, + -1.9557, -0.1482, 1.7376, 2.2039, -0.6589, 1.3314, -0.4498, 0.5493, 0.0539, 0.2601, 0.857, 2.5211, + -0.0452, -0.3105, -0.9407, -0.0034, 1.5199, 0.2654, -0.1441, 0.5407, -1.5476, 0.6455, -1.1382, 0.6052, + 1.1904, 1.2195, -0.047, -1.0914, 1.0223, 0.2113, 0.0306, 0.3604, 0.3166, -0.8975, -0.6393, 0.6207, -0.169, + -0.5816, 1.2632, 0.0695, 1.1862, 1.8127, -1.4459, -0.2158, -0.9792, -1.4392, 0.6508, 0.8964, 0.5717, + -0.239, 0.6983, -1.3416, 0.2715, -0.5996, -1.0962, 1.6327, 1.3951, 0.8784, 0.3389, 0.5054, -0.6681, + -1.4382, 1.7547, -0.9605, -0.4558, -0.2937, 0.9238, -1.2185, 0.4138, 0.5033, 0.9174, -0.4792, 0.6756, + -0.3413, -0.2242, -0.2111, 0.6282, -0.1213, -1.1116, -0.7401, -0.7879, 0.0606, -2.3337, -1.0941, -0.3682, + -0.0163, -0.0645, -0.8101, 0.1415, 0.8238, 0.2262, 1.2912, 0.6488, 1.2114, 1.3569, -0.2852, 0.6051, + 0.2167, -0.2181, -1.6306, 1.4788, 1.2907, 0.3124, 0.7299, 1.422, 0.3375, 0.0438, -1.6105, 0.2979, 1.1537, + -1.5604, 1.2779, -1.2514, 1.8131, 1.4436, -0.4207, 0.022, -0.6807, -1.3306, 0.1667, -1.4055, 1.5895, + 1.0838, -0.9077, -0.806, -1.2603, -1.7245, -0.3533, -0.9421, -0.1776, 0.3992, 0.0551, 0.5873, -0.5887, + -1.4733, -0.8565, 0.74, 0.2983, 0.4718, -1.1936, 0.7928, -0.8665, 0.9468, 0.2754, -0.0261, -0.4618, + -0.5646, -1.0389, 0.5819, 1.8698, -0.2635, -2.0799, -0.6313, 0.409, -1.1458, 0.6056, 0.5763, -3.3558, + 0.2836, 0.6909, -0.7631, 1.5646, 0.3338, 
0.7105, 0.4683, -0.6179, 0.0818, 0.7967, -2.9351, 2.4179, + -0.4026, 0.6451, 1.6845, -1.7142, -0.5319, -0.8848, 0.6513, 1.0002, -1.4699, -0.5033, 0.0553, 0.9265, + -0.8652, -0.0288, -0.2209, 1.1629, 0.0616, -1.3136, -0.2764, 0.0277, -0.1126, 1.3697, 0.0002, 1.5333, + -1.0556, -0.1254, 0.1527, 0.0784, -1.8848, -1.6165, 0.6179, 0.9905, -0.0729, 2.4451, -0.35, 1.3289, + -0.6494, 0.3478, 1.0038, -0.0488, -0.981, -1.3632, 0.0929, -1.7926, -0.2921, -0.0901, 0.6106, 2.3603, + 1.3908, -0.7917, -0.6734, -1.4254, 0.7013, 0.2414, 0.2551, -0.7457, 0.3133, 0.061, 0.6776, 0.4361, + -0.8052, 0.3955, 0.8988, 0.2342, -0.5866, -1.8219, 1.1079, 0.5795, -1.4249 + ], + "dims": [2, 4, 8, 6], + "type": "float32" + }, + { + "data": [0], + "dims": [1], + "type": "int64" + }, + { + "data": [ + 1.0, 1.0, 1.0, 0.5403, 0.9989, 1.0, -0.4161, 0.9957, 1.0, -0.99, 0.9903, 1.0, -0.6536, 0.9828, 1.0, + 0.2837, 0.9732, 0.9999, 0.9602, 0.9615, 0.9999, 0.7539, 0.9477, 0.9999, -0.1455, 0.9318, 0.9999, -0.9111, + 0.914, 0.9998, -0.8391, 0.8942, 0.9998, 0.0044, 0.8725, 0.9997, 0.8439, 0.8488, 0.9997, 0.9074, 0.8234, + 0.9996, 0.1367, 0.7962, 0.9995, -0.7597, 0.7673, 0.9995 + ], + "dims": [16, 3], + "type": "float32" + }, + { + "data": [ + 0.0, 0.0, 0.0, 0.8415, 0.0464, 0.0022, 0.9093, 0.0927, 0.0043, 0.1411, 0.1388, 0.0065, -0.7568, 0.1846, + 0.0086, -0.9589, 0.23, 0.0108, -0.2794, 0.2749, 0.0129, 0.657, 0.3192, 0.0151, 0.9894, 0.3629, 0.0172, + 0.4121, 0.4057, 0.0194, -0.544, 0.4477, 0.0215, -1.0, 0.4887, 0.0237, -0.5366, 0.5286, 0.0259, 0.4202, + 0.5675, 0.028, 0.9906, 0.605, 0.0302, 0.6503, 0.6413, 0.0323 + ], + "dims": [16, 3], + "type": "float32" + } + ], + "outputs": [ + { + "data": [ + -1.0408, 0.9166, -1.3042, -1.1097, -1.2188, 1.1676, -0.4713, -0.954, -0.9229, 0.3027, -0.5708, -0.2363, + 0.2653, 1.2295, -0.1839, -0.4517, -1.5052, -0.4651, 0.7652, -0.641, 0.0365, -0.0452, 1.0593, 0.8929, + -0.6807, -0.0252, -0.3834, 2.7394, 0.1308, 1.1203, 1.7702, 0.1949, -1.1653, 1.6049, -0.2755, -0.2749, + -1.2496, 0.3383, -0.0315, -0.7461, 1.151, 0.4445, -0.4045, -1.3431, -0.6094, -1.1105, -0.9631, -0.1137, + -1.019, 0.3157, -1.6036, 1.8493, 0.0447, 1.5853, -1.2713, 0.1137, 0.8112, -1.1659, -0.5824, -0.4419, + 0.1155, -2.1237, -0.7586, -0.211, 1.1441, -0.6304, 1.4856, 0.0038, -1.0865, 1.4794, -0.2417, 0.9428, + -2.1196, -0.9618, 0.197, -0.0972, -0.2764, 0.3332, 2.1087, 0.4272, 0.8076, 0.29, -0.0714, 0.8261, 0.3203, + -0.9031, 0.2727, 0.2609, 2.0968, 1.0974, -0.7219, 0.8582, -1.3443, -0.6684, -1.0227, -1.5929, 0.1036, + -0.3514, 0.2421, 0.6463, 0.873, -0.9276, -0.7649, 0.7011, -0.4569, -0.5639, -0.5328, -0.6424, 0.4186, + 0.2303, -0.1519, 1.1903, 0.5382, -0.1906, -0.6894, -0.6293, 0.2904, 1.5747, -0.4956, 0.9199, -0.4522, + 1.1844, 0.3867, -0.6626, -0.9405, 1.8656, -1.1016, -1.3814, -0.1366, 0.2981, 0.606, -1.4132, 0.712, + -0.5164, 0.7415, -0.0031, -0.1568, 0.1533, -0.2622, 0.2264, 0.0713, 0.1843, -1.3387, -1.6797, 1.0311, + -1.9557, -0.1482, 1.7376, 2.2039, -0.6589, 1.0979, 0.8773, 0.5462, 0.0793, 0.2582, 0.8576, -1.008, 2.3112, + -0.222, -0.9655, -0.0099, 1.5198, -0.2424, 0.1801, 0.7503, -1.4576, 0.6529, -1.134, 0.5053, -1.2361, + 1.2072, 0.1789, -1.1002, 1.0129, 0.0893, -0.1939, 0.2779, 0.391, -0.8906, -0.6489, 0.5487, -0.3357, + -0.9064, 1.0546, 0.0542, 1.187, 2.3165, 0.1009, 0.1081, -0.9969, -1.4488, 0.6291, 0.8964, 0.5717, -0.239, + 0.6983, -1.3416, 0.2715, 0.5985, -1.0968, 1.5662, 1.4693, 0.8776, 0.3408, 0.3972, 0.7376, -1.5947, 1.6138, + -0.9586, -0.46, 0.1604, -0.956, -1.2641, 0.2406, 0.4973, 0.9206, 0.8245, -0.0789, -0.294, 
-0.2833, + -0.2165, 0.6264, -1.1003, -0.199, -0.5391, -0.937, 0.0857, -2.333, -1.1534, -0.0478, 0.0021, -0.0665, + -0.8118, 0.131, 0.4724, 0.7117, 1.0165, 1.027, 1.1908, 1.375, -0.2852, 0.6051, 0.2167, -0.2181, -1.6306, + 1.4788, 0.4345, 1.2549, 0.6631, 1.4543, 0.3374, 0.0445, 0.3993, -1.5884, 1.2934, -1.4467, 1.2833, -1.2459, + -1.9987, -1.1733, -0.4197, -0.0366, -0.672, -1.335, -1.1726, 0.7926, 1.3621, 1.3586, -0.9007, -0.8138, + -2.0112, 0.7193, -0.1272, -0.9981, -0.1818, 0.3973, 0.2171, 0.5485, -0.161, -1.5784, -0.866, 0.7289, + -0.085, 0.5517, -1.3842, 0.3703, -0.8806, 0.9336, 0.2754, -0.0261, -0.4618, -0.5646, -1.0389, 0.5819, + 1.232, 1.4311, -2.0483, -0.7272, 0.4114, -1.1449, -0.776, 0.3108, -3.3677, -0.0287, 0.6942, -0.7601, + -1.596, -0.1097, 0.6386, 0.5624, -0.6184, 0.0778, -2.7421, 1.3155, 2.4507, 0.0507, 0.6305, 1.69, -0.9963, + 1.4929, -1.0109, 0.4304, 1.016, -1.459, -0.4678, 0.1937, 1.1287, -0.5772, -0.0259, -0.2212, 0.8362, + 0.8105, -1.1566, -0.6813, 0.0294, -0.1122, 1.3697, 0.0002, 1.5333, -1.0556, -0.1254, 0.1527, 1.6283, + -0.9524, -1.6435, 0.5422, 0.9907, -0.0708, -0.6993, 2.369, 1.3834, -0.5234, 0.3435, 1.0053, 0.1867, + 0.9643, -1.3629, -0.0972, -1.7907, -0.3037, 0.521, -0.3309, 2.063, 1.8026, -0.7859, -0.6802, 0.2682, + 1.5658, 0.1762, 0.3038, -0.7491, 0.3052, 0.2479, 0.6336, 0.6407, -0.6543, 0.3838, 0.9039, 0.562, -0.2884, + -2.0803, 0.4684, 0.6009, -1.416 + ], + "dims": [2, 4, 8, 6], + "type": "float32" + } + ] + } + ] + }, + { + "name": "RotaryEmbedding with custom rotary dim", + "operator": "RotaryEmbedding", + "opset": { "domain": "com.microsoft", "version": 1 }, + "attributes": [ + { "name": "num_heads", "data": 1, "type": "int" }, + { "name": "rotary_embedding_dim", "data": 4, "type": "int" } + ], + "cases": [ + { + "name": "T[1,2,6] T[1,2] T[2,2] T[2,2]", + "inputs": [ + { + "data": [ + -1.0408, 0.9166, -1.3042, -1.1097, -1.2188, 1.1676, 1.0076, -0.7529, -0.225, -0.4327, -1.5071, -0.4586 + ], + "dims": [1, 2, 6], + "type": "float32" + }, + { + "data": [0, 1], + "dims": [1, 2], + "type": "int64" + }, + { + "data": [1.0, 1.0, 1.0, 0.5403], + "dims": [2, 2], + "type": "float32" + }, + { + "data": [0.0, 0.0, 0.0, 0.8415], + "dims": [2, 2], + "type": "float32" + } + ], + "outputs": [ + { + "data": [ + -1.0408, 0.9166, -1.3042, -1.1097, -1.2188, 1.1676, 1.0076, -0.0427, -0.225, -0.8673, -1.5071, -0.4586 + ], + "dims": [1, 2, 6], + "type": "float32" + } + ] + } + ] + } +] diff --git a/js/web/test/data/ops/simplified-layer-norm.jsonc b/js/web/test/data/ops/simplified-layer-norm.jsonc new file mode 100644 index 000000000000..346919ab63e4 --- /dev/null +++ b/js/web/test/data/ops/simplified-layer-norm.jsonc @@ -0,0 +1,48 @@ +[ + { + "name": "SimplifiedLayerNormalization", + "operator": "SimplifiedLayerNormalization", + "opset": { "domain": "", "version": 16 }, + "attributes": [ + { + "name": "epsilon", + "data": 1e-5, + "type": "float" + } + ], + "inputShapeDefinitions": "rankOnly", + "cases": [ + { + "name": "default", + "inputs": [ + { + "data": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16], + "dims": [1, 2, 8], + "type": "float32" + }, + { + "data": [2, 2, 2, 2, 2, 2, 2, 2], + "dims": [8], + "type": "float32" + } + ], + "outputs": [ + { + "data": [ + 0.39605894684791565, 0.7921178936958313, 1.1881768703460693, 1.5842357873916626, 1.9802948236465454, + 2.3763537406921387, 2.7724127769470215, 3.168471574783325, 1.4164010286331177, 1.5737788677215576, + 1.731156826019287, 1.888534665107727, 2.045912504196167, 2.2032904624938965, 2.360668420791626, + 
2.5180463790893555 + ], + "dims": [1, 2, 8], + "type": "float32" + }, + { + "data": null, + "type": "float32" + } + ] + } + ] + } +] diff --git a/js/web/test/data/ops/skip-simplified-layer-norm.jsonc b/js/web/test/data/ops/skip-simplified-layer-norm.jsonc new file mode 100644 index 000000000000..9cf521238224 --- /dev/null +++ b/js/web/test/data/ops/skip-simplified-layer-norm.jsonc @@ -0,0 +1,53 @@ +[ + { + "name": "SkipSimplifiedLayerNormalization", + "operator": "SkipSimplifiedLayerNormalization", + "opset": { "domain": "com.microsoft", "version": 1 }, + "attributes": [ + { + "name": "epsilon", + "data": 1e-5, + "type": "float" + } + ], + "inputShapeDefinitions": "rankOnly", + "cases": [ + { + "name": "default", + "inputs": [ + { + "data": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16], + "dims": [1, 2, 8], + "type": "float32" + }, + { + "data": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15], + "dims": [1, 2, 8], + "type": "float32" + }, + { + "data": [2, 2, 2, 2, 2, 2, 2, 2], + "dims": [8], + "type": "float32" + } + ], + "outputs": [ + { + "data": [ + 0.21693046391010284, 0.650791347026825, 1.084652304649353, 1.5185132026672363, 1.9523741006851196, + 2.386234998703003, 2.820096015930176, 3.2539567947387695, 1.3915272951126099, 1.5552364587783813, + 1.7189455032348633, 1.8826546669006348, 2.046363592147827, 2.2100727558135986, 2.37378191947937, + 2.5374910831451416 + ], + "dims": [1, 2, 8], + "type": "float32" + }, + { + "data": null, + "type": "float32" + } + ] + } + ] + } +] diff --git a/js/web/test/data/ops/tanh.jsonc b/js/web/test/data/ops/tanh.jsonc new file mode 100644 index 000000000000..f7691535bd71 --- /dev/null +++ b/js/web/test/data/ops/tanh.jsonc @@ -0,0 +1,26 @@ +[ + { + "name": "tanh with no attributes", + "operator": "Tanh", + "attributes": [], + "cases": [ + { + "name": "T[2,4]", + "inputs": [ + { + "data": [-1000, -1, 0, 0.1, 0.2, 0.3, 0.4, 1000], + "dims": [2, 4], + "type": "float32" + } + ], + "outputs": [ + { + "data": [-1, -0.761594, 0, 0.099668, 0.197375, 0.291313, 0.379949, 1], + "dims": [2, 4], + "type": "float32" + } + ] + } + ] + } +] diff --git a/js/web/test/data/ops/where.jsonc b/js/web/test/data/ops/where.jsonc index 047fd6fd7511..990120dd3708 100644 --- a/js/web/test/data/ops/where.jsonc +++ b/js/web/test/data/ops/where.jsonc @@ -168,5 +168,39 @@ ] } ] + }, + { + "name": "Where with no attributes", + "operator": "Where", + "attributes": [], + "cases": [ + { + "name": "T[1 1 2 1] T[1 4] T[1 1 2 4] float32 broadcast 1", + "inputs": [ + { + "data": [true, false], + "dims": [1, 1, 2, 1], + "type": "bool" + }, + { + "data": [1, 2, 3, 4], + "dims": [1, 4], + "type": "float32" + }, + { + "data": [5, 6, 7, 8, 9, 10, 11, 12], + "dims": [1, 1, 2, 4], + "type": "float32" + } + ], + "outputs": [ + { + "data": [1, 2, 3, 4, 9, 10, 11, 12], + "dims": [1, 1, 2, 4], + "type": "float32" + } + ] + } + ] } ] diff --git a/js/web/test/e2e/browser-test-webgl.js b/js/web/test/e2e/browser-test-webgl.js index e503f38ae573..974c81d064c8 100644 --- a/js/web/test/e2e/browser-test-webgl.js +++ b/js/web/test/e2e/browser-test-webgl.js @@ -6,3 +6,16 @@ it('Browser E2E testing - WebGL backend', async function() { await testFunction(ort, {executionProviders: ['webgl']}); }); + +it('Browser E2E testing - invalid buffer', async () => { + try { + await ort.InferenceSession.create( + new Uint8Array(Array.from({length: 100}, () => 42)), {executionProviders: ['webgl']}); + + // Should not reach here. 
+ assert(false); + } catch (e) { + assert(e.message.includes('as ONNX format')); + assert(e.message.includes('as ORT format')); + } +}); diff --git a/js/web/test/e2e/browser-test-webgpu-external-data.js b/js/web/test/e2e/browser-test-webgpu-external-data.js new file mode 100644 index 000000000000..8fb0b4d6ec54 --- /dev/null +++ b/js/web/test/e2e/browser-test-webgpu-external-data.js @@ -0,0 +1,24 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +'use strict'; + +it('Browser E2E testing - WebGPU backend with external data', async function() { + const session = await ort.InferenceSession.create('./model_with_orig_ext_data.onnx', { + executionProviders: ['webgpu'], + externalData: [{data: './model_with_orig_ext_data.bin', path: 'model_with_orig_ext_data.bin'}] + }); + + const fetches = await session.run({X: new ort.Tensor('float32', [1, 1], [1, 2])}); + + const Y = fetches.Y; + + assert(Y instanceof ort.Tensor); + assert(Y.dims.length === 2 && Y.dims[0] === 2 && Y.dims[1] === 3); + assert(Y.data[0] === 1); + assert(Y.data[1] === 1); + assert(Y.data[2] === 0); + assert(Y.data[3] === 0); + assert(Y.data[4] === 0); + assert(Y.data[5] === 0); +}); diff --git a/js/web/test/e2e/karma.conf.js b/js/web/test/e2e/karma.conf.js index b7ff408fa29c..b541d9d12011 100644 --- a/js/web/test/e2e/karma.conf.js +++ b/js/web/test/e2e/karma.conf.js @@ -15,6 +15,8 @@ if (typeof USER_DATA !== 'string') { throw new Error('flag --user-data= is required'); } +const flags = ['--ignore-gpu-blocklist', '--gpu-vendor-id=0x10de']; + module.exports = function(config) { const distPrefix = SELF_HOST ? './node_modules/onnxruntime-web/dist/' : 'http://localhost:8081/dist/'; config.set({ @@ -25,10 +27,14 @@ module.exports = function(config) { {pattern: TEST_MAIN}, {pattern: './node_modules/onnxruntime-web/dist/*.wasm', included: false, nocache: true}, {pattern: './model.onnx', included: false}, + {pattern: './model_with_orig_ext_data.onnx', included: false}, + {pattern: './model_with_orig_ext_data.bin', included: false}, ], plugins: [require('@chiragrupani/karma-chromium-edge-launcher'), ...config.plugins], proxies: { '/model.onnx': '/base/model.onnx', + '/model_with_orig_ext_data.onnx': '/base/model_with_orig_ext_data.onnx', + '/model_with_orig_ext_data.bin': '/base/model_with_orig_ext_data.bin', '/test-wasm-path-override/ort-wasm.wasm': '/base/node_modules/onnxruntime-web/dist/ort-wasm.wasm', '/test-wasm-path-override/renamed.wasm': '/base/node_modules/onnxruntime-web/dist/ort-wasm.wasm', }, @@ -43,10 +49,11 @@ module.exports = function(config) { hostname: 'localhost', browsers: [], customLaunchers: { - Chrome_default: {base: 'ChromeHeadless', chromeDataDir: USER_DATA}, + Chrome_default: {base: 'Chrome', flags, chromeDataDir: USER_DATA}, Chrome_no_threads: { - base: 'ChromeHeadless', + base: 'Chrome', chromeDataDir: USER_DATA, + flags // TODO: no-thread flags }, Edge_default: {base: 'Edge', edgeDataDir: USER_DATA} diff --git a/js/web/test/e2e/model_with_orig_ext_data.bin b/js/web/test/e2e/model_with_orig_ext_data.bin new file mode 100644 index 000000000000..d69e6beeff85 Binary files /dev/null and b/js/web/test/e2e/model_with_orig_ext_data.bin differ diff --git a/js/web/test/e2e/model_with_orig_ext_data.onnx b/js/web/test/e2e/model_with_orig_ext_data.onnx new file mode 100644 index 000000000000..6f9cce0bc5b4 --- /dev/null +++ b/js/web/test/e2e/model_with_orig_ext_data.onnx @@ -0,0 +1,19 @@ +  onnx-example: +: +X +model_with_orig_ext_dataY"Pad* +mode"constant 
+test-model*JBmodel_with_orig_ext_dataj( +locationmodel_with_orig_ext_data.binpZ +X +  + +Z& +model_with_orig_ext_data + + +b +Y +  + +B \ No newline at end of file diff --git a/js/web/test/e2e/run.js b/js/web/test/e2e/run.js index 2776f6dff46a..46c04792f1b9 100644 --- a/js/web/test/e2e/run.js +++ b/js/web/test/e2e/run.js @@ -119,6 +119,7 @@ async function testAllBrowserCases({hostInKarma}) { await runKarma({hostInKarma, main: './browser-test-wasm-path-override-prefix.js'}); await runKarma({hostInKarma, main: './browser-test-wasm-path-override-prefix.js', ortMain: 'ort.wasm.min.js'}); await runKarma({hostInKarma, main: './browser-test-wasm-image-tensor-image.js'}); + await runKarma({hostInKarma, main: './browser-test-webgpu-external-data.js', ortMain: 'ort.webgpu.min.js'}); } async function runKarma({hostInKarma, main, browser = BROWSER, ortMain = 'ort.min.js'}) { diff --git a/js/web/test/e2e/simple-http-server.js b/js/web/test/e2e/simple-http-server.js index 1244aaddafd2..6a6162855df8 100644 --- a/js/web/test/e2e/simple-http-server.js +++ b/js/web/test/e2e/simple-http-server.js @@ -16,6 +16,7 @@ const validRequests = { '/dist/ort-wasm-simd.wasm': ['dist/ort-wasm-simd.wasm', 'application/wasm'], '/dist/ort-wasm-threaded.wasm': ['dist/ort-wasm-threaded.wasm', 'application/wasm'], '/dist/ort-wasm-simd-threaded.wasm': ['dist/ort-wasm-simd-threaded.wasm', 'application/wasm'], + '/dist/ort-wasm-simd.jsep.wasm': ['dist/ort-wasm-simd.jsep.wasm', 'application/wasm'], // proxied .wasm files: '/test-wasm-path-override/ort-wasm.wasm': ['dist/ort-wasm.wasm', 'application/wasm'], @@ -25,6 +26,7 @@ const validRequests = { '/dist/ort.min.js': ['dist/ort.min.js', 'text/javascript'], '/dist/ort.js': ['dist/ort.js', 'text/javascript'], '/dist/ort.webgl.min.js': ['dist/ort.webgl.min.js', 'text/javascript'], + '/dist/ort.webgpu.min.js': ['dist/ort.webgpu.min.js', 'text/javascript'], '/dist/ort.wasm.min.js': ['dist/ort.wasm.min.js', 'text/javascript'], '/dist/ort.wasm-core.min.js': ['dist/ort.wasm-core.min.js', 'text/javascript'], }; diff --git a/js/web/test/suite-test-list.jsonc b/js/web/test/suite-test-list.jsonc index 594ce9feed31..811e3659b598 100644 --- a/js/web/test/suite-test-list.jsonc +++ b/js/web/test/suite-test-list.jsonc @@ -472,11 +472,11 @@ // "test_cumsum_2d_axis_0", // "test_cumsum_2d_axis_1", // "test_cumsum_2d_negative_axis", - // "test_depthtospace_crd_mode_example", - // "test_depthtospace_crd_mode", - // "test_depthtospace_dcr_mode", - // "test_depthtospace_example", - // "test_depthtospace", + "test_depthtospace_crd_mode_example", + "test_depthtospace_crd_mode", + "test_depthtospace_dcr_mode", + "test_depthtospace_example", + "test_depthtospace", // // "test_dequantizelinear_axis", // // "test_dequantizelinear", // // "test_det_2d", @@ -553,7 +553,7 @@ "test_gemm_broadcast", "test_gemm_default_matrix_bias", "test_gemm_default_no_bias", - "test_gemm_default_scalar_bias", + // "test_gemm_default_scalar_bias", "test_gemm_default_single_elem_vector_bias", "test_gemm_default_vector_bias", "test_gemm_default_zero_bias", @@ -597,9 +597,9 @@ // // "test_hardmax_example", // // "test_hardmax_negative_axis", // // "test_hardmax_one_hot", - // // "test_hardsigmoid_default", - // // "test_hardsigmoid_example", - // // "test_hardsigmoid", + "test_hardsigmoid_default", + "test_hardsigmoid_example", + "test_hardsigmoid", // // "test_hardswish_expanded", // // "test_hardswish", "test_if", @@ -637,9 +637,9 @@ "test_layer_normalization_4d_axis_negative_1", // // 
"test_layer_normalization_4d_axis_negative_2_expanded", "test_layer_normalization_4d_axis_negative_2", - "test_layer_normalization_4d_axis_negative_3_expanded", + // "test_layer_normalization_4d_axis_negative_3_expanded", "test_layer_normalization_4d_axis_negative_3", - "test_layer_normalization_4d_axis_negative_4_expanded", + // "test_layer_normalization_4d_axis_negative_4_expanded", "test_layer_normalization_4d_axis_negative_4", "test_layer_normalization_4d_axis0_expanded", "test_layer_normalization_4d_axis0", @@ -1231,7 +1231,7 @@ "test_split_variable_parts_1d", "test_split_variable_parts_2d", "test_split_variable_parts_default_axis", - // // "test_split_zero_size_splits", + "test_split_zero_size_splits", "test_sqrt_example", "test_sqrt", "test_squeeze_negative_axes", @@ -1334,6 +1334,7 @@ "acos.jsonc", "add.jsonc", "add_int32.jsonc", + "add_zero-sized.jsonc", //"and.jsonc", "asin.jsonc", "attention.jsonc", @@ -1343,16 +1344,19 @@ "ceil.jsonc", "concat.jsonc", "concat_int32.jsonc", + "concat_zero-sized.jsonc", "cast.jsonc", "conv.jsonc", "cos.jsonc", "div.jsonc", "div_int32.jsonc", - //"depth-to-space.jsonc", + "depth-to-space.jsonc", "equal.jsonc", "exp.jsonc", "expand.jsonc", + "fast-gelu.jsonc", "floor.jsonc", + "fused-conv.jsonc", "gather-elements.jsonc", "gemm.jsonc", "global-average-pool.jsonc", @@ -1361,6 +1365,7 @@ "less.jsonc", "log.jsonc", "matmul.jsonc", + "matmulnbits.jsonc", "matmul-broadcast.jsonc", "mul.jsonc", "mul_int32.jsonc", @@ -1380,7 +1385,10 @@ "pow_int32.jsonc", "pow-big-number.jsonc", "reshape.jsonc", + "rotary-embedding.jsonc", + "simplified-layer-norm.jsonc", "skip-layer-norm.jsonc", + "skip-simplified-layer-norm.jsonc", "slice.jsonc", //"softmax.jsonc", "sin.jsonc", @@ -1389,6 +1397,7 @@ "sub.jsonc", "sub_int32.jsonc", "tan.jsonc", + "tanh.jsonc", "tile.jsonc", "transpose.jsonc", "transpose_int32_uint32.jsonc", @@ -1501,99 +1510,1046 @@ "webnn": { "onnx": ["resnet50", "squeezenet", "tiny_yolov2", "emotion_ferplus"], "node": [ - // Check in node tests that have native Wasm implementations. - // (i.e.) not tests that rely on the fallback cpu implementations. - // Use the 'cpu' level of node tests to test those implementations. 
+ "test_abs", + // "test_acos_example", + // "test_acos", + // "test_acosh_example", + // "test_acosh", + // // "test_adagrad_multiple", + // // "test_adagrad", + // // "test_adam_multiple", + // // "test_adam", "test_add_bcast", + // "test_add_uint8", "test_add", - "test_sub_bcast", - "test_sub_example", - "test_sub", - "test_mul_bcast", - "test_mul_example", - "test_mul", - "test_div_bcast", - "test_div_example", - "test_div", - "test_xor_bcast3v1d", - "test_xor_bcast3v2d", - "test_xor_bcast4v2d", - "test_xor_bcast4v3d", - "test_xor_bcast4v4d", - "test_xor2d", - "test_xor3d", - "test_xor4d", - "test_or_bcast3v1d", - "test_or_bcast3v2d", - "test_or_bcast4v2d", - "test_or_bcast4v3d", - "test_or_bcast4v4d", - "test_and_bcast3v1d", - "test_and_bcast3v2d", - "test_and_bcast4v2d", - "test_and_bcast4v3d", - "test_and_bcast4v4d", - "test_and2d", - "test_and3d", - "test_and4d", - "test_prelu_broadcast", - "test_prelu_example", + // "test_and_bcast3v1d", + // "test_and_bcast3v2d", + // "test_and_bcast4v2d", + // "test_and_bcast4v3d", + // "test_and_bcast4v4d", + // "test_and2d", + // "test_and3d", + // "test_and4d", + "test_argmax_default_axis_example_select_last_index", + "test_argmax_default_axis_example", + "test_argmax_default_axis_random_select_last_index", + "test_argmax_default_axis_random", + "test_argmax_keepdims_example_select_last_index", + "test_argmax_keepdims_example", + "test_argmax_keepdims_random_select_last_index", + "test_argmax_keepdims_random", + "test_argmax_negative_axis_keepdims_example_select_last_index", + "test_argmax_negative_axis_keepdims_example", + "test_argmax_negative_axis_keepdims_random_select_last_index", + "test_argmax_negative_axis_keepdims_random", + "test_argmax_no_keepdims_example_select_last_index", + "test_argmax_no_keepdims_example", + "test_argmax_no_keepdims_random_select_last_index", + "test_argmax_no_keepdims_random", + "test_argmin_default_axis_example_select_last_index", + "test_argmin_default_axis_example", + "test_argmin_default_axis_random_select_last_index", + "test_argmin_default_axis_random", + "test_argmin_keepdims_example_select_last_index", + "test_argmin_keepdims_example", + "test_argmin_keepdims_random_select_last_index", + "test_argmin_keepdims_random", + "test_argmin_negative_axis_keepdims_example_select_last_index", + "test_argmin_negative_axis_keepdims_example", + "test_argmin_negative_axis_keepdims_random_select_last_index", + "test_argmin_negative_axis_keepdims_random", + "test_argmin_no_keepdims_example_select_last_index", + "test_argmin_no_keepdims_example", + "test_argmin_no_keepdims_random_select_last_index", + "test_argmin_no_keepdims_random", + // "test_asin_example", + // "test_asin", + // "test_asinh_example", + // "test_asinh", + // "test_atan_example", + // "test_atan", + // "test_atanh_example", + // "test_atanh", + // "test_averagepool_1d_default", + // "test_averagepool_2d_ceil", + "test_averagepool_2d_default", + "test_averagepool_2d_pads_count_include_pad", + "test_averagepool_2d_pads", + "test_averagepool_2d_precomputed_pads_count_include_pad", + "test_averagepool_2d_precomputed_pads", + "test_averagepool_2d_precomputed_same_upper", + "test_averagepool_2d_precomputed_strides", + "test_averagepool_2d_same_lower", + "test_averagepool_2d_same_upper", + "test_averagepool_2d_strides", + // "test_averagepool_3d_default", "test_basic_conv_with_padding", "test_basic_conv_without_padding", + // "test_basic_convinteger", + "test_batchnorm_epsilon_training_mode", "test_batchnorm_epsilon", + 
"test_batchnorm_example_training_mode", "test_batchnorm_example", - "opset{10,11,12}/test_cast_STRING_to_FLOAT", - "test_clip_splitbounds", - "test_clip_outbounds", - "test_clip_inbounds", - "test_clip_example", - "test_clip_default_min", - "test_clip_default_max", + // // "test_bernoulli_double_expanded", + // // "test_bernoulli_double", + // // "test_bernoulli_expanded", + // // "test_bernoulli_seed_expanded", + // // "test_bernoulli_seed", + // // "test_bernoulli", + // // "test_bitshift_left_uint16", + // // "test_bitshift_left_uint32", + // // "test_bitshift_left_uint64", + // // "test_bitshift_left_uint8", + // // "test_bitshift_right_uint16", + // // "test_bitshift_right_uint32", + // // "test_bitshift_right_uint64", + // // "test_bitshift_right_uint8", + // // "test_blackmanwindow_expanded", + // // "test_blackmanwindow_symmetric_expanded", + // // "test_blackmanwindow_symmetric", + // // "test_blackmanwindow", + // // "test_cast_BFLOAT16_to_FLOAT", + "test_cast_DOUBLE_to_FLOAT", + // "test_cast_DOUBLE_to_FLOAT16", + // // "test_cast_FLOAT_to_BFLOAT16", + "test_cast_FLOAT_to_DOUBLE", + // // "test_cast_FLOAT_to_FLOAT16", + // // "test_cast_FLOAT_to_STRING", + // "test_cast_FLOAT16_to_DOUBLE", + // "test_cast_FLOAT16_to_FLOAT", + // // "test_cast_STRING_to_FLOAT", + // // "test_castlike_BFLOAT16_to_FLOAT_expanded", + // // "test_castlike_BFLOAT16_to_FLOAT", + // // "test_castlike_DOUBLE_to_FLOAT_expanded", + // // "test_castlike_DOUBLE_to_FLOAT", + // // "test_castlike_DOUBLE_to_FLOAT16_expanded", + // // "test_castlike_DOUBLE_to_FLOAT16", + // // "test_castlike_FLOAT_to_BFLOAT16_expanded", + // // "test_castlike_FLOAT_to_BFLOAT16", + // // "test_castlike_FLOAT_to_DOUBLE_expanded", + // // "test_castlike_FLOAT_to_DOUBLE", + // // "test_castlike_FLOAT_to_FLOAT16_expanded", + // // "test_castlike_FLOAT_to_FLOAT16", + // // "test_castlike_FLOAT_to_STRING_expanded", + // // "test_castlike_FLOAT_to_STRING", + // // "test_castlike_FLOAT16_to_DOUBLE_expanded", + // // "test_castlike_FLOAT16_to_DOUBLE", + // // "test_castlike_FLOAT16_to_FLOAT_expanded", + // // "test_castlike_FLOAT16_to_FLOAT", + // // "test_castlike_STRING_to_FLOAT_expanded", + // // "test_castlike_STRING_to_FLOAT", + "test_ceil_example", + "test_ceil", + // "test_celu_expanded", + // "test_celu", "test_clip_default_inbounds", + "test_clip_default_int8_inbounds", + "test_clip_default_int8_max", + "test_clip_default_int8_min", + "test_clip_default_max", + "test_clip_default_min", + "test_clip_example", + "test_clip_inbounds", + "test_clip_outbounds", + "test_clip_splitbounds", "test_clip", + // // "test_compress_0", + // // "test_compress_1", + // // "test_compress_default_axis", + // // "test_compress_negative_axis", + "test_concat_1d_axis_0", + "test_concat_1d_axis_negative_1", + "test_concat_2d_axis_0", + "test_concat_2d_axis_1", + "test_concat_2d_axis_negative_1", + "test_concat_2d_axis_negative_2", + "test_concat_3d_axis_0", + "test_concat_3d_axis_1", + "test_concat_3d_axis_2", + "test_concat_3d_axis_negative_1", + "test_concat_3d_axis_negative_2", + "test_concat_3d_axis_negative_3", + "test_conv_with_autopad_same", "test_conv_with_strides_and_asymmetric_padding", "test_conv_with_strides_no_padding", "test_conv_with_strides_padding", - "test_gemm_nobroadcast", - "test_gemm_broadcast", - "test_matmul_2d", - "test_matmul_3d", - "test_matmul_4d", - "test_softmax_axis_0", - "test_softmax_axis_1", - "test_softmax_axis_2", - "test_softmax_default_axis", - "test_softmax_example", - "test_softmax_large_number", - 
"test_sum_example", - "test_sum_one_input", - "test_sum_two_inputs", - "test_averagepool_1d_default", - "test_averagepool_2d_default", - "test_averagepool_2d_pads", - "test_averagepool_2d_precomputed_pads", - "test_averagepool_2d_precomputed_same_upper", - "test_averagepool_2d_precomputed_strides", - "test_averagepool_2d_same_upper", - "test_averagepool_2d_same_lower", - "test_averagepool_2d_strides", - "test_averagepool_3d_default", - "test_maxpool_1d_default", - "test_maxpool_2d_default", - "test_maxpool_2d_pads", - "test_maxpool_2d_precomputed_pads", - "test_maxpool_2d_precomputed_same_upper", - "test_maxpool_2d_precomputed_strides", - "test_maxpool_2d_same_lower", - "test_maxpool_2d_same_upper", - "test_maxpool_2d_strides", - "test_maxpool_3d_default", - "test_globalaveragepool_precomputed", - "test_globalaveragepool", - "test_globalmaxpool_precomputed", - "test_globalmaxpool", - "test_instancenorm_epsilon", - "test_instancenorm_example" + // // "test_convinteger_with_padding", + // // "test_convinteger_without_padding", + "test_convtranspose_1d", + // // "test_convtranspose_3d", + // "test_convtranspose_autopad_same", + "test_convtranspose_dilations", + "test_convtranspose_kernel_shape", + "opset{9,17}/test_convtranspose_output_shape", + "test_convtranspose_pad", + "test_convtranspose_pads", + "test_convtranspose_with_kernel", + "test_convtranspose", + "test_cos_example", + "test_cos", + // "test_cosh_example", + // "test_cosh", + // "test_cumsum_1d_exclusive", + // "test_cumsum_1d_reverse_exclusive", + // "test_cumsum_1d_reverse", + // "test_cumsum_1d", + // "test_cumsum_2d_axis_0", + // "test_cumsum_2d_axis_1", + // "test_cumsum_2d_negative_axis", + // "test_depthtospace_crd_mode_example", + // "test_depthtospace_crd_mode", + // "test_depthtospace_dcr_mode", + // "test_depthtospace_example", + // "test_depthtospace", + // // "test_dequantizelinear_axis", + // // "test_dequantizelinear", + // // "test_det_2d", + // // "test_det_nd", + // // "test_dft_axis", + // // "test_dft_inverse", + // // "test_dft", + "test_div_bcast", + "test_div_example", + // "test_div_uint8", + "test_div", + // // "test_dropout_default_mask_ratio", + // // "test_dropout_default_mask", + // // "test_dropout_default_old", + // // "test_dropout_default_ratio", + // // "test_dropout_default", + // // "test_dropout_random_old", + // // "test_dropout_random", + // // "test_dynamic_slice_default_axes", + // // "test_dynamic_slice_end_out_of_bounds", + // // "test_dynamic_slice_neg", + // // "test_dynamic_slice_start_out_of_bounds", + // // "test_dynamic_slice", + // // "test_dynamicquantizelinear_expanded", + // // "test_dynamicquantizelinear_max_adjusted_expanded", + // // "test_dynamicquantizelinear_max_adjusted", + // // "test_dynamicquantizelinear_min_adjusted_expanded", + // // "test_dynamicquantizelinear_min_adjusted", + // // "test_dynamicquantizelinear", + // "test_edge_pad", + // "test_einsum_batch_diagonal", + // "test_einsum_batch_matmul", + // "test_einsum_inner_prod", + // "test_einsum_sum", + // "test_einsum_transpose", + "test_elu_default", + "test_elu_example", + "test_elu", + "test_equal_bcast", + "test_equal", + // "test_erf", + "test_exp_example", + "test_exp", + // "test_expand_dim_changed", + // "test_expand_dim_unchanged", + // "test_eyelike_populate_off_main_diagonal", + // "test_eyelike_with_dtype", + // "test_eyelike_without_dtype", + "test_flatten_axis0", + "test_flatten_axis1", + "test_flatten_axis2", + "test_flatten_axis3", + "test_flatten_default_axis", + "test_flatten_negative_axis1", 
+ "test_flatten_negative_axis2", + "test_flatten_negative_axis3", + "test_flatten_negative_axis4", + "test_floor_example", + "test_floor", + "test_gather_0", + "test_gather_1", + "test_gather_2d_indices", + "test_gather_negative_indices", + "test_gather_elements_0", + "test_gather_elements_1", + "test_gather_elements_negative_indices", + // "test_gathernd_example_float32", + // "test_gathernd_example_int32_batch_dim1", + // "test_gathernd_example_int32", + "test_gemm_all_attributes", + "test_gemm_alpha", + "test_gemm_beta", + "test_gemm_broadcast", + "test_gemm_default_matrix_bias", + "test_gemm_default_no_bias", + // "test_gemm_default_scalar_bias", + "test_gemm_default_single_elem_vector_bias", + "test_gemm_default_vector_bias", + "test_gemm_default_zero_bias", + "test_gemm_nobroadcast", + "test_gemm_transposeA", + "test_gemm_transposeB", + "test_globalaveragepool_precomputed", + "test_globalaveragepool", + "test_globalmaxpool_precomputed", + "test_globalmaxpool", + "test_greater_bcast", + "test_greater_equal_bcast_expanded", + "test_greater_equal_bcast", + "test_greater_equal_expanded", + "test_greater_equal", + "test_greater", + // // "test_gridsample_aligncorners_true", + // // "test_gridsample_bicubic", + // // "test_gridsample_bilinear", + // // "test_gridsample_border_padding", + // // "test_gridsample_nearest", + // // "test_gridsample_reflection_padding", + // // "test_gridsample_zeros_padding", + // // "test_gridsample", + // // "test_gru_batchwise", + // // "test_gru_defaults", + // // "test_gru_seq_length", + // // "test_gru_with_initial_bias", + // // "test_hammingwindow_expanded", + // // "test_hammingwindow_symmetric_expanded", + // // "test_hammingwindow_symmetric", + // // "test_hammingwindow", + // // "test_hannwindow_expanded", + // // "test_hannwindow_symmetric_expanded", + // // "test_hannwindow_symmetric", + // // "test_hannwindow", + // // "test_hardmax_axis_0", + // // "test_hardmax_axis_1", + // // "test_hardmax_axis_2", + // // "test_hardmax_default_axis", + // // "test_hardmax_example", + // // "test_hardmax_negative_axis", + // // "test_hardmax_one_hot", + "test_hardsigmoid_default", + "test_hardsigmoid_example", + "test_hardsigmoid", + "test_hardswish_expanded", + "test_hardswish", + // "test_if", + // TODO: Uncomment 'test_if_seq' and 'test_if_opt' once the test infra + // supports Sequence and Optional types + // "test_if_seq", + // "test_if_opt", + "test_instancenorm_epsilon", + "test_instancenorm_example", + // "test_isinf_negative", + // "test_isinf_positive", + // "test_isinf", + // "test_isnan", + // "test_layer_normalization_2d_axis_negative_1_expanded", + "test_layer_normalization_2d_axis_negative_1", + // "test_layer_normalization_2d_axis_negative_2_expanded", + "test_layer_normalization_2d_axis_negative_2", + // "test_layer_normalization_2d_axis0_expanded", + "test_layer_normalization_2d_axis0", + // "test_layer_normalization_2d_axis1_expanded", + "test_layer_normalization_2d_axis1", + // "test_layer_normalization_3d_axis_negative_1_epsilon_expanded", + "test_layer_normalization_3d_axis_negative_1_epsilon", + // "test_layer_normalization_3d_axis_negative_2_epsilon_expanded", + "test_layer_normalization_3d_axis_negative_2_epsilon", + // "test_layer_normalization_3d_axis_negative_3_epsilon_expanded", + "test_layer_normalization_3d_axis_negative_3_epsilon", + // "test_layer_normalization_3d_axis0_epsilon_expanded", + "test_layer_normalization_3d_axis0_epsilon", + // "test_layer_normalization_3d_axis1_epsilon_expanded", + 
"test_layer_normalization_3d_axis1_epsilon", + // "test_layer_normalization_3d_axis2_epsilon_expanded", + "test_layer_normalization_3d_axis2_epsilon", + // "test_layer_normalization_4d_axis_negative_1_expanded", + "test_layer_normalization_4d_axis_negative_1", + // "test_layer_normalization_4d_axis_negative_2_expanded", + "test_layer_normalization_4d_axis_negative_2", + // "test_layer_normalization_4d_axis_negative_3_expanded", + "test_layer_normalization_4d_axis_negative_3", + // "test_layer_normalization_4d_axis_negative_4_expanded", + "test_layer_normalization_4d_axis_negative_4", + // "test_layer_normalization_4d_axis0_expanded", + "test_layer_normalization_4d_axis0", + // "test_layer_normalization_4d_axis1_expanded", + "test_layer_normalization_4d_axis1", + // "test_layer_normalization_4d_axis2_expanded", + "test_layer_normalization_4d_axis2", + // "test_layer_normalization_4d_axis3_expanded", + "test_layer_normalization_4d_axis3", + // "test_layer_normalization_default_axis_expanded", + "test_layer_normalization_default_axis", + "test_leakyrelu_default", + "test_leakyrelu_example", + "test_leakyrelu", + "test_less_bcast", + "test_less_equal_bcast_expanded", + "test_less_equal_bcast", + "test_less_equal_expanded", + "test_less_equal", + "test_less", + "test_log_example", + "test_log", + // // "test_logsoftmax_axis_0_expanded", + // // "test_logsoftmax_axis_0", + // // "test_logsoftmax_axis_1_expanded", + // // "test_logsoftmax_axis_1", + // // "test_logsoftmax_axis_2_expanded", + // // "test_logsoftmax_axis_2", + // // "test_logsoftmax_default_axis_expanded", + // // "test_logsoftmax_default_axis", + // // "test_logsoftmax_example_1_expanded", + // // "test_logsoftmax_example_1", + // // "test_logsoftmax_large_number_expanded", + // // "test_logsoftmax_large_number", + // // "test_logsoftmax_negative_axis_expanded", + // // "test_logsoftmax_negative_axis", + // "test_lrn_default", + // "test_lrn", + // // "test_lstm_batchwise", + // // "test_lstm_defaults", + // // "test_lstm_with_initial_bias", + // // "test_lstm_with_peepholes", + "test_matmul_2d", + "test_matmul_3d", + "test_matmul_4d", + // // "test_matmulinteger", + "test_max_example", + // "test_max_float16", + "test_max_float32", + "test_max_float64", + // "test_max_int16", + // "test_max_int32", + // "test_max_int64", + // "test_max_int8", + "test_max_one_input", + "test_max_two_inputs", + // "test_max_uint16", + // "test_max_uint32", + // "test_max_uint64", + // "test_max_uint8", + // "test_maxpool_1d_default", + // "test_maxpool_2d_ceil", + "test_maxpool_2d_default", + "test_maxpool_2d_dilations", + "test_maxpool_2d_pads", + "test_maxpool_2d_precomputed_pads", + "test_maxpool_2d_precomputed_same_upper", + "test_maxpool_2d_precomputed_strides", + "test_maxpool_2d_same_lower", + "test_maxpool_2d_same_upper", + "test_maxpool_2d_strides", + // "test_maxpool_2d_uint8", + // "test_maxpool_3d_default", + // "test_maxpool_with_argmax_2d_precomputed_pads", + // "test_maxpool_with_argmax_2d_precomputed_strides", + // // "test_maxunpool_export_with_output_shape", + // // "test_maxunpool_export_without_output_shape", + // // "test_mean_example", + // // "test_mean_one_input", + // // "test_mean_two_inputs", + // // "test_melweightmatrix", + "test_min_example", + // "test_min_float16", + "test_min_float32", + "test_min_float64", + // "test_min_int16", + // "test_min_int32", + // "test_min_int64", + // "test_min_int8", + "test_min_one_input", + "test_min_two_inputs", + // "test_min_uint16", + // "test_min_uint32", + // "test_min_uint64", 
+ // "test_min_uint8", + // "test_mod_bcast", + // "test_mod_broadcast", + // "test_mod_float_mixed_sign_example", + // "test_mod_fmod_mixed_sign_example", + // "test_mod_int64_fmod", + // "test_mod_int64_mixed_sign_example", + // "test_mod_mixed_sign_float16", + // "test_mod_mixed_sign_float32", + // "test_mod_mixed_sign_float64", + // "test_mod_mixed_sign_int16", + // "test_mod_mixed_sign_int32", + // "test_mod_mixed_sign_int64", + // "test_mod_mixed_sign_int8", + // "test_mod_uint16", + // "test_mod_uint32", + // "test_mod_uint64", + // "test_mod_uint8", + // // "test_momentum_multiple", + // // "test_momentum", + "test_mul_bcast", + "test_mul_example", + // "test_mul_uint8", + "test_mul", + // "test_mvn_expanded", + // "test_mvn", + "test_neg_example", + "test_neg", + // // "test_negative_log_likelihood_loss_iinput_shape_is_NCd1_weight_ignore_index_expanded", + // // "test_negative_log_likelihood_loss_iinput_shape_is_NCd1_weight_ignore_index", + // // "test_negative_log_likelihood_loss_input_shape_is_NC_expanded", + // // "test_negative_log_likelihood_loss_input_shape_is_NC", + // // "test_negative_log_likelihood_loss_input_shape_is_NCd1_expanded", + // // "test_negative_log_likelihood_loss_input_shape_is_NCd1_ignore_index_expanded", + // // "test_negative_log_likelihood_loss_input_shape_is_NCd1_ignore_index", + // // "test_negative_log_likelihood_loss_input_shape_is_NCd1_mean_weight_negative_ignore_index_expanded", + // // "test_negative_log_likelihood_loss_input_shape_is_NCd1_mean_weight_negative_ignore_index", + // // "test_negative_log_likelihood_loss_input_shape_is_NCd1_weight_expanded", + // // "test_negative_log_likelihood_loss_input_shape_is_NCd1_weight", + // // "test_negative_log_likelihood_loss_input_shape_is_NCd1", + // // "test_negative_log_likelihood_loss_input_shape_is_NCd1d2_expanded", + // // "test_negative_log_likelihood_loss_input_shape_is_NCd1d2_no_weight_reduction_mean_ignore_index_expanded", + // // "test_negative_log_likelihood_loss_input_shape_is_NCd1d2_no_weight_reduction_mean_ignore_index", + // // "test_negative_log_likelihood_loss_input_shape_is_NCd1d2_reduction_mean_expanded", + // // "test_negative_log_likelihood_loss_input_shape_is_NCd1d2_reduction_mean", + // // "test_negative_log_likelihood_loss_input_shape_is_NCd1d2_reduction_sum_expanded", + // // "test_negative_log_likelihood_loss_input_shape_is_NCd1d2_reduction_sum", + // // "test_negative_log_likelihood_loss_input_shape_is_NCd1d2_with_weight_expanded", + // // "test_negative_log_likelihood_loss_input_shape_is_NCd1d2_with_weight_reduction_mean_expanded", + // // "test_negative_log_likelihood_loss_input_shape_is_NCd1d2_with_weight_reduction_mean", + // // "test_negative_log_likelihood_loss_input_shape_is_NCd1d2_with_weight_reduction_sum_expanded", + // // "test_negative_log_likelihood_loss_input_shape_is_NCd1d2_with_weight_reduction_sum_ignore_index_expanded", + // // "test_negative_log_likelihood_loss_input_shape_is_NCd1d2_with_weight_reduction_sum_ignore_index", + // // "test_negative_log_likelihood_loss_input_shape_is_NCd1d2_with_weight_reduction_sum", + // // "test_negative_log_likelihood_loss_input_shape_is_NCd1d2_with_weight", + // // "test_negative_log_likelihood_loss_input_shape_is_NCd1d2", + // // "test_negative_log_likelihood_loss_input_shape_is_NCd1d2d3_none_no_weight_negative_ignore_index_expanded", + // // "test_negative_log_likelihood_loss_input_shape_is_NCd1d2d3_none_no_weight_negative_ignore_index", + // // 
"test_negative_log_likelihood_loss_input_shape_is_NCd1d2d3_sum_weight_high_ignore_index_expanded", + // // "test_negative_log_likelihood_loss_input_shape_is_NCd1d2d3_sum_weight_high_ignore_index", + // // "test_negative_log_likelihood_loss_input_shape_is_NCd1d2d3d4d5_mean_weight_expanded", + // // "test_negative_log_likelihood_loss_input_shape_is_NCd1d2d3d4d5_mean_weight", + // // "test_negative_log_likelihood_loss_input_shape_is_NCd1d2d3d4d5_none_no_weight_expanded", + // // "test_negative_log_likelihood_loss_input_shape_is_NCd1d2d3d4d5_none_no_weight", + // // "test_nesterov_momentum", + // // "test_nllloss_NC_expanded", + // // "test_nllloss_NC", + // // "test_nllloss_NCd1_expanded", + // // "test_nllloss_NCd1_ii_expanded", + // // "test_nllloss_NCd1_ii", + // // "test_nllloss_NCd1_mean_weight_negative_ii_expanded", + // // "test_nllloss_NCd1_mean_weight_negative_ii", + // // "test_nllloss_NCd1_weight_expanded", + // // "test_nllloss_NCd1_weight_ii_expanded", + // // "test_nllloss_NCd1_weight_ii", + // // "test_nllloss_NCd1_weight", + // // "test_nllloss_NCd1", + // // "test_nllloss_NCd1d2_expanded", + // // "test_nllloss_NCd1d2_no_weight_reduction_mean_ii_expanded", + // // "test_nllloss_NCd1d2_no_weight_reduction_mean_ii", + // // "test_nllloss_NCd1d2_reduction_mean_expanded", + // // "test_nllloss_NCd1d2_reduction_mean", + // // "test_nllloss_NCd1d2_reduction_sum_expanded", + // // "test_nllloss_NCd1d2_reduction_sum", + // // "test_nllloss_NCd1d2_with_weight_expanded", + // // "test_nllloss_NCd1d2_with_weight_reduction_mean_expanded", + // // "test_nllloss_NCd1d2_with_weight_reduction_mean", + // // "test_nllloss_NCd1d2_with_weight_reduction_sum_expanded", + // // "test_nllloss_NCd1d2_with_weight_reduction_sum_ii_expanded", + // // "test_nllloss_NCd1d2_with_weight_reduction_sum_ii", + // // "test_nllloss_NCd1d2_with_weight_reduction_sum", + // // "test_nllloss_NCd1d2_with_weight", + // // "test_nllloss_NCd1d2", + // // "test_nllloss_NCd1d2d3_none_no_weight_negative_ii_expanded", + // // "test_nllloss_NCd1d2d3_none_no_weight_negative_ii", + // // "test_nllloss_NCd1d2d3_sum_weight_high_ii_expanded", + // // "test_nllloss_NCd1d2d3_sum_weight_high_ii", + // // "test_nllloss_NCd1d2d3d4d5_mean_weight_expanded", + // // "test_nllloss_NCd1d2d3d4d5_mean_weight", + // // "test_nllloss_NCd1d2d3d4d5_none_no_weight_expanded", + // // "test_nllloss_NCd1d2d3d4d5_none_no_weight", + // "test_nonmaxsuppression_center_point_box_format", + // "test_nonmaxsuppression_flipped_coordinates", + // "test_nonmaxsuppression_identical_boxes", + // "test_nonmaxsuppression_limit_output_size", + // "test_nonmaxsuppression_single_box", + // "test_nonmaxsuppression_suppress_by_IOU_and_scores", + // "test_nonmaxsuppression_suppress_by_IOU", + // "test_nonmaxsuppression_two_batches", + // "test_nonmaxsuppression_two_classes", + // "test_nonzero_example", + "test_not_2d", + "test_not_3d", + "test_not_4d", + // // "test_onehot_negative_indices", + // // "test_onehot_with_axis", + // // "test_onehot_with_negative_axis", + // // "test_onehot_without_axis", + // // "test_optional_get_element_sequence", + // // "test_optional_get_element", + // // "test_optional_has_element_empty", + // // "test_optional_has_element", + // "test_or_bcast3v1d", + // "test_or_bcast3v2d", + // "test_or_bcast4v2d", + // "test_or_bcast4v3d", + // "test_or_bcast4v4d", + // "test_or2d", + // "test_or3d", + // "test_or4d", + "test_pow_bcast_array", + "test_pow_bcast_scalar", + "test_pow_example", + // "test_pow_types_float", + // 
"test_pow_types_float32_int32", + // "test_pow_types_float32_int64", + // "test_pow_types_float32_uint32", + // "test_pow_types_float32_uint64", + // "test_pow_types_int", + // "test_pow_types_int32_float32", + // "test_pow_types_int32_int32", + // "test_pow_types_int64_float32", + // "test_pow_types_int64_int64", + "test_pow", + "test_prelu_broadcast", + "test_prelu_example", + // // "test_qlinearconv", + // // "test_qlinearmatmul_2D", + // // "test_qlinearmatmul_3D", + // // "test_quantizelinear_axis", + // // "test_quantizelinear", + // "test_range_float_type_positive_delta_expanded", + // "test_range_float_type_positive_delta", + // "test_range_int32_type_negative_delta_expanded", + // "test_range_int32_type_negative_delta", + "test_reciprocal_example", + "test_reciprocal", + "test_reduce_l1_default_axes_keepdims_example", + "test_reduce_l1_default_axes_keepdims_random", + "test_reduce_l1_do_not_keepdims_example", + "test_reduce_l1_do_not_keepdims_random", + "test_reduce_l1_keep_dims_example", + "test_reduce_l1_keep_dims_random", + "test_reduce_l1_negative_axes_keep_dims_example", + "test_reduce_l1_negative_axes_keep_dims_random", + "test_reduce_l2_default_axes_keepdims_example", + "test_reduce_l2_default_axes_keepdims_random", + "test_reduce_l2_do_not_keepdims_example", + "test_reduce_l2_do_not_keepdims_random", + "test_reduce_l2_keep_dims_example", + "test_reduce_l2_keep_dims_random", + "test_reduce_l2_negative_axes_keep_dims_example", + "test_reduce_l2_negative_axes_keep_dims_random", + "test_reduce_log_sum_asc_axes", + "test_reduce_log_sum_default", + "test_reduce_log_sum_desc_axes", + // tests "test_reduce_log_sum_exp_*" on opset17/opset18 are excluded because they use float64. + // "opset{7,8,9}/test_reduce_log_sum_exp_default_axes_keepdims_example", + // "opset{7,8,9}/test_reduce_log_sum_exp_default_axes_keepdims_random", + // "opset{7,8,9}/test_reduce_log_sum_exp_do_not_keepdims_example", + // "opset{7,8,9}/test_reduce_log_sum_exp_do_not_keepdims_random", + // "opset{7,8,9}/test_reduce_log_sum_exp_keepdims_example", + // "opset{7,8,9}/test_reduce_log_sum_exp_keepdims_random", + // "opset11/test_reduce_log_sum_exp_negative_axes_keepdims_example", + // "opset11/test_reduce_log_sum_exp_negative_axes_keepdims_random", + "test_reduce_log_sum_negative_axes", + "test_reduce_log_sum", + "test_reduce_max_default_axes_keepdim_example", + // "test_reduce_max_default_axes_keepdims_random", + // "test_reduce_max_do_not_keepdims_example", + // "test_reduce_max_do_not_keepdims_random", + // "test_reduce_max_keepdims_example", + // "test_reduce_max_keepdims_random", + // "test_reduce_max_negative_axes_keepdims_example", + // "test_reduce_max_negative_axes_keepdims_random", + // "test_reduce_mean_default_axes_keepdims_example", + // "test_reduce_mean_default_axes_keepdims_random", + // "test_reduce_mean_do_not_keepdims_example", + // "test_reduce_mean_do_not_keepdims_random", + // "test_reduce_mean_keepdims_example", + // "test_reduce_mean_keepdims_random", + // "test_reduce_mean_negative_axes_keepdims_example", + // "test_reduce_mean_negative_axes_keepdims_random", + // "test_reduce_min_default_axes_keepdims_example", + // "test_reduce_min_default_axes_keepdims_random", + // "test_reduce_min_do_not_keepdims_example", + // "test_reduce_min_do_not_keepdims_random", + // "test_reduce_min_keepdims_example", + // "test_reduce_min_keepdims_random", + // "test_reduce_min_negative_axes_keepdims_example", + // "test_reduce_min_negative_axes_keepdims_random", + // 
"test_reduce_prod_default_axes_keepdims_example", + // "test_reduce_prod_default_axes_keepdims_random", + // "test_reduce_prod_do_not_keepdims_example", + // "test_reduce_prod_do_not_keepdims_random", + // "test_reduce_prod_keepdims_example", + // "test_reduce_prod_keepdims_random", + // "test_reduce_prod_negative_axes_keepdims_example", + // "test_reduce_prod_negative_axes_keepdims_random", + // "test_reduce_sum_default_axes_keepdims_example", + // "test_reduce_sum_default_axes_keepdims_random", + // "test_reduce_sum_do_not_keepdims_example", + // "test_reduce_sum_do_not_keepdims_random", + "test_reduce_sum_empty_axes_input_noop_example", + "test_reduce_sum_empty_axes_input_noop_random", + // "test_reduce_sum_keepdims_example", + // "test_reduce_sum_keepdims_random", + // "test_reduce_sum_negative_axes_keepdims_example", + // "test_reduce_sum_negative_axes_keepdims_random", + // "test_reduce_sum_square_default_axes_keepdims_example", + // "test_reduce_sum_square_default_axes_keepdims_random", + // "test_reduce_sum_square_do_not_keepdims_example", + // "test_reduce_sum_square_do_not_keepdims_random", + // "test_reduce_sum_square_keepdims_example", + // "test_reduce_sum_square_keepdims_random", + // "test_reduce_sum_square_negative_axes_keepdims_example", + // "test_reduce_sum_square_negative_axes_keepdims_random", + // "test_reflect_pad", + "test_relu", + "test_reshape_allowzero_reordered", + "test_reshape_extended_dims", + "test_reshape_negative_dim", + "test_reshape_negative_extended_dims", + "test_reshape_one_dim", + "test_reshape_reduced_dims", + "test_reshape_reordered_all_dims", + "test_reshape_reordered_dims", + "test_reshape_reordered_last_dims", + "test_reshape_zero_and_negative_dim", + "test_reshape_zero_dim", + "test_resize_downsample_linear", + "test_resize_downsample_nearest", + "test_resize_downsample_scales_cubic_A_n0p5_exclude_outside", + // "test_resize_downsample_scales_cubic_align_corners", + "test_resize_downsample_scales_cubic", + // "test_resize_downsample_scales_linear_align_corners", + "test_resize_downsample_scales_linear", + "test_resize_downsample_scales_nearest", + "test_resize_downsample_sizes_cubic", + "test_resize_downsample_sizes_linear_pytorch_half_pixel", + "test_resize_downsample_sizes_nearest_tf_half_pixel_for_nn", + "test_resize_downsample_sizes_nearest", + "test_resize_nearest", + "test_resize_tf_crop_and_resize", + "test_resize_upsample_linear", + "test_resize_upsample_nearest", + "test_resize_upsample_scales_cubic_A_n0p5_exclude_outside", + "test_resize_upsample_scales_cubic_align_corners", + "test_resize_upsample_scales_cubic_asymmetric", + "test_resize_upsample_scales_cubic", + "test_resize_upsample_scales_linear_align_corners", + "test_resize_upsample_scales_linear", + "test_resize_upsample_scales_nearest", + "test_resize_upsample_sizes_cubic", + "opset{12,13,17,18}/test_resize_upsample_sizes_nearest_ceil_half_pixel", + "opset{12,13,17,18}/test_resize_upsample_sizes_nearest_floor_align_corners", + "opset{12,13,17,18}/test_resize_upsample_sizes_nearest_round_prefer_ceil_asymmetric", + "test_resize_upsample_sizes_nearest", + // // "test_reversesequence_batch", + // // "test_reversesequence_time", + // // "test_rnn_seq_length", + // // "test_roialign_aligned_false", + // // "test_roialign_aligned_true", + // // "test_roialign", + // // "test_round", + // // "test_scan_sum", + // // "test_scan9_sum", + // // "test_scatter_elements_with_axis", + // // "test_scatter_elements_with_duplicate_indices", + // // "test_scatter_elements_with_negative_indices", 
+ // // "test_scatter_elements_without_axis", + // // "test_scatter_with_axis", + // // "test_scatter_without_axis", + // // "test_scatternd_add", + // // "test_scatternd_multiply", + // // "test_scatternd", + // // "test_sce_mean_3d_expanded", + // // "test_sce_mean_3d_log_prob_expanded", + // // "test_sce_mean_3d_log_prob", + // // "test_sce_mean_3d", + // // "test_sce_mean_expanded", + // // "test_sce_mean_log_prob_expanded", + // // "test_sce_mean_log_prob", + // // "test_sce_mean_no_weight_ii_3d_expanded", + // // "test_sce_mean_no_weight_ii_3d_log_prob_expanded", + // // "test_sce_mean_no_weight_ii_3d_log_prob", + // // "test_sce_mean_no_weight_ii_3d", + // // "test_sce_mean_no_weight_ii_4d_expanded", + // // "test_sce_mean_no_weight_ii_4d_log_prob_expanded", + // // "test_sce_mean_no_weight_ii_4d_log_prob", + // // "test_sce_mean_no_weight_ii_4d", + // // "test_sce_mean_no_weight_ii_expanded", + // // "test_sce_mean_no_weight_ii_log_prob_expanded", + // // "test_sce_mean_no_weight_ii_log_prob", + // // "test_sce_mean_no_weight_ii", + // // "test_sce_mean_weight_expanded", + // // "test_sce_mean_weight_ii_3d_expanded", + // // "test_sce_mean_weight_ii_3d_log_prob_expanded", + // // "test_sce_mean_weight_ii_3d_log_prob", + // // "test_sce_mean_weight_ii_3d", + // // "test_sce_mean_weight_ii_4d_expanded", + // // "test_sce_mean_weight_ii_4d_log_prob_expanded", + // // "test_sce_mean_weight_ii_4d_log_prob", + // // "test_sce_mean_weight_ii_4d", + // // "test_sce_mean_weight_ii_expanded", + // // "test_sce_mean_weight_ii_log_prob_expanded", + // // "test_sce_mean_weight_ii_log_prob", + // // "test_sce_mean_weight_ii", + // // "test_sce_mean_weight_log_prob_expanded", + // // "test_sce_mean_weight_log_prob", + // // "test_sce_mean_weight", + // // "test_sce_mean", + // // "test_sce_NCd1_mean_weight_negative_ii_expanded", + // // "test_sce_NCd1_mean_weight_negative_ii_log_prob_expanded", + // // "test_sce_NCd1_mean_weight_negative_ii_log_prob", + // // "test_sce_NCd1_mean_weight_negative_ii", + // // "test_sce_NCd1d2d3_none_no_weight_negative_ii_expanded", + // // "test_sce_NCd1d2d3_none_no_weight_negative_ii_log_prob_expanded", + // // "test_sce_NCd1d2d3_none_no_weight_negative_ii_log_prob", + // // "test_sce_NCd1d2d3_none_no_weight_negative_ii", + // // "test_sce_NCd1d2d3_sum_weight_high_ii_expanded", + // // "test_sce_NCd1d2d3_sum_weight_high_ii_log_prob_expanded", + // // "test_sce_NCd1d2d3_sum_weight_high_ii_log_prob", + // // "test_sce_NCd1d2d3_sum_weight_high_ii", + // // "test_sce_NCd1d2d3d4d5_mean_weight_expanded", + // // "test_sce_NCd1d2d3d4d5_mean_weight_log_prob_expanded", + // // "test_sce_NCd1d2d3d4d5_mean_weight_log_prob", + // // "test_sce_NCd1d2d3d4d5_mean_weight", + // // "test_sce_NCd1d2d3d4d5_none_no_weight_expanded", + // // "test_sce_NCd1d2d3d4d5_none_no_weight_log_prob_expanded", + // // "test_sce_NCd1d2d3d4d5_none_no_weight_log_prob", + // // "test_sce_NCd1d2d3d4d5_none_no_weight", + // // "test_sce_none_expanded", + // // "test_sce_none_log_prob_expanded", + // // "test_sce_none_log_prob", + // // "test_sce_none_weights_expanded", + // // "test_sce_none_weights_log_prob_expanded", + // // "test_sce_none_weights_log_prob", + // // "test_sce_none_weights", + // // "test_sce_none", + // // "test_sce_sum_expanded", + // // "test_sce_sum_log_prob_expanded", + // // "test_sce_sum_log_prob", + // // "test_sce_sum", + // "test_selu_default", + // "test_selu_example", + // "test_selu", + // // "test_sequence_insert_at_back", + // // "test_sequence_insert_at_front", + // // 
"test_sequence_map_add_1_sequence_1_tensor_expanded", + // // "test_sequence_map_add_1_sequence_1_tensor", + // // "test_sequence_map_add_2_sequences_expanded", + // // "test_sequence_map_add_2_sequences", + // // "test_sequence_map_extract_shapes_expanded", + // // "test_sequence_map_extract_shapes", + // // "test_sequence_map_identity_1_sequence_1_tensor_expanded", + // // "test_sequence_map_identity_1_sequence_1_tensor", + // // "test_sequence_map_identity_1_sequence_expanded", + // // "test_sequence_map_identity_1_sequence", + // // "test_sequence_map_identity_2_sequences_expanded", + // // "test_sequence_map_identity_2_sequences", + // "test_shrink_hard", + // "test_shrink_soft", + "test_sigmoid_example", + "test_sigmoid", + // "test_sign", + // "test_simple_rnn_batchwise", + // "test_simple_rnn_defaults", + // "test_simple_rnn_with_initial_bias", + "test_sin_example", + "test_sin", + // "test_sinh_example", + // "test_sinh", + // // "test_size_example", + // // "test_size", + // "test_slice_default_axes", + // "test_slice_default_steps", + // "test_slice_end_out_of_bounds", + // "test_slice_neg_steps", + // "test_slice_neg", + // "test_slice_negative_axes", + // "test_slice_start_out_of_bounds", + // "test_slice", + // "test_softmax_axis_0_expanded", + "test_softmax_axis_0", + // "test_softmax_axis_1_expanded", + "test_softmax_axis_1", + // "test_softmax_axis_2_expanded", + "test_softmax_axis_2", + // "test_softmax_cross_entropy_input_shape_is_NCd1_mean_weight_negative_ignore_index_expanded", + // "test_softmax_cross_entropy_input_shape_is_NCd1_mean_weight_negative_ignore_index_log_prob_expanded", + // "test_softmax_cross_entropy_input_shape_is_NCd1_mean_weight_negative_ignore_index_log_prob", + // "test_softmax_cross_entropy_input_shape_is_NCd1_mean_weight_negative_ignore_index", + // "test_softmax_cross_entropy_input_shape_is_NCd1d2d3_none_no_weight_negative_ignore_index_expanded", + // "test_softmax_cross_entropy_input_shape_is_NCd1d2d3_none_no_weight_negative_ignore_index_log_prob_expanded", + // "test_softmax_cross_entropy_input_shape_is_NCd1d2d3_none_no_weight_negative_ignore_index_log_prob", + // "test_softmax_cross_entropy_input_shape_is_NCd1d2d3_none_no_weight_negative_ignore_index", + // "test_softmax_cross_entropy_input_shape_is_NCd1d2d3_sum_weight_high_ignore_index_expanded", + // "test_softmax_cross_entropy_input_shape_is_NCd1d2d3_sum_weight_high_ignore_index_log_prob_expanded", + // "test_softmax_cross_entropy_input_shape_is_NCd1d2d3_sum_weight_high_ignore_index_log_prob", + // "test_softmax_cross_entropy_input_shape_is_NCd1d2d3_sum_weight_high_ignore_index", + // "test_softmax_cross_entropy_input_shape_is_NCd1d2d3d4d5_mean_weight_expanded", + // "test_softmax_cross_entropy_input_shape_is_NCd1d2d3d4d5_mean_weight_log_prob_expanded", + // "test_softmax_cross_entropy_input_shape_is_NCd1d2d3d4d5_mean_weight_log_prob", + // "test_softmax_cross_entropy_input_shape_is_NCd1d2d3d4d5_mean_weight", + // "test_softmax_cross_entropy_input_shape_is_NCd1d2d3d4d5_none_no_weight_expanded", + // "test_softmax_cross_entropy_input_shape_is_NCd1d2d3d4d5_none_no_weight_log_prob_expanded", + // "test_softmax_cross_entropy_input_shape_is_NCd1d2d3d4d5_none_no_weight_log_prob", + // "test_softmax_cross_entropy_input_shape_is_NCd1d2d3d4d5_none_no_weight", + // "test_softmax_cross_entropy_mean_3d_expanded", + // "test_softmax_cross_entropy_mean_3d_log_prob_expanded", + // "test_softmax_cross_entropy_mean_3d_log_prob", + // "test_softmax_cross_entropy_mean_3d", + // 
"test_softmax_cross_entropy_mean_expanded", + // "test_softmax_cross_entropy_mean_log_prob_expanded", + // "test_softmax_cross_entropy_mean_log_prob", + // "test_softmax_cross_entropy_mean_no_weight_ignore_index_3d_expanded", + // "test_softmax_cross_entropy_mean_no_weight_ignore_index_3d_log_prob_expanded", + // "test_softmax_cross_entropy_mean_no_weight_ignore_index_3d_log_prob", + // "test_softmax_cross_entropy_mean_no_weight_ignore_index_3d", + // "test_softmax_cross_entropy_mean_no_weight_ignore_index_4d_expanded", + // "test_softmax_cross_entropy_mean_no_weight_ignore_index_4d_log_prob_expanded", + // "test_softmax_cross_entropy_mean_no_weight_ignore_index_4d_log_prob", + // "test_softmax_cross_entropy_mean_no_weight_ignore_index_4d", + // "test_softmax_cross_entropy_mean_no_weight_ignore_index_expanded", + // "test_softmax_cross_entropy_mean_no_weight_ignore_index_log_prob_expanded", + // "test_softmax_cross_entropy_mean_no_weight_ignore_index_log_prob", + // "test_softmax_cross_entropy_mean_no_weight_ignore_index", + // "test_softmax_cross_entropy_mean_weight_expanded", + // "test_softmax_cross_entropy_mean_weight_ignore_index_3d_expanded", + // "test_softmax_cross_entropy_mean_weight_ignore_index_3d_log_prob_expanded", + // "test_softmax_cross_entropy_mean_weight_ignore_index_3d_log_prob", + // "test_softmax_cross_entropy_mean_weight_ignore_index_3d", + // "test_softmax_cross_entropy_mean_weight_ignore_index_4d_expanded", + // "test_softmax_cross_entropy_mean_weight_ignore_index_4d_log_prob_expanded", + // "test_softmax_cross_entropy_mean_weight_ignore_index_4d_log_prob", + // "test_softmax_cross_entropy_mean_weight_ignore_index_4d", + // "test_softmax_cross_entropy_mean_weight_ignore_index_expanded", + // "test_softmax_cross_entropy_mean_weight_ignore_index_log_prob_expanded", + // "test_softmax_cross_entropy_mean_weight_ignore_index_log_prob", + // "test_softmax_cross_entropy_mean_weight_ignore_index", + // "test_softmax_cross_entropy_mean_weight_log_prob_expanded", + // "test_softmax_cross_entropy_mean_weight_log_prob", + // "test_softmax_cross_entropy_mean_weight", + // "test_softmax_cross_entropy_mean", + // "test_softmax_cross_entropy_none_expanded", + // "test_softmax_cross_entropy_none_log_prob_expanded", + // "test_softmax_cross_entropy_none_log_prob", + // "test_softmax_cross_entropy_none_weights_expanded", + // "test_softmax_cross_entropy_none_weights_log_prob_expanded", + // "test_softmax_cross_entropy_none_weights_log_prob", + // "test_softmax_cross_entropy_none_weights", + // "test_softmax_cross_entropy_none", + // "test_softmax_cross_entropy_sum_expanded", + // "test_softmax_cross_entropy_sum_log_prob_expanded", + // "test_softmax_cross_entropy_sum_log_prob", + // "test_softmax_cross_entropy_sum", + // "opset13/test_softmax_default_axis_expanded", + "opset13/test_softmax_default_axis", + // "test_softmax_example_expanded", + "test_softmax_example", + // "test_softmax_large_number_expanded", + "test_softmax_large_number", + // "test_softmax_negative_axis_expanded", + "test_softmax_negative_axis", + // // "test_softplus_example", + // // "test_softplus", + // // "test_softsign_example", + // // "test_softsign", + // "test_spacetodepth_example", + // "test_spacetodepth", + "test_split_equal_parts_1d", + "test_split_equal_parts_2d", + "test_split_equal_parts_default_axis", + "test_split_variable_parts_1d", + "test_split_variable_parts_2d", + "test_split_variable_parts_default_axis", + "test_split_zero_size_splits", + "test_sqrt_example", + "test_sqrt", + 
"test_squeeze_negative_axes", + "test_squeeze", + // // "test_stft_with_window", + // // "test_stft", + // // "test_strnormalizer_export_monday_casesensintive_lower", + // // "test_strnormalizer_export_monday_casesensintive_nochangecase", + // // "test_strnormalizer_export_monday_casesensintive_upper", + // // "test_strnormalizer_export_monday_empty_output", + // // "test_strnormalizer_export_monday_insensintive_upper_twodim", + // // "test_strnormalizer_nostopwords_nochangecase", + "test_sub_bcast", + "test_sub_example", + // "test_sub_uint8", + "test_sub", + // "test_sum_example", + // "test_sum_one_input", + // "test_sum_two_inputs", + "test_tan_example", + "test_tan", + "test_tanh_example", + "test_tanh", + // // "test_tfidfvectorizer_tf_batch_onlybigrams_skip0", + // // "test_tfidfvectorizer_tf_batch_onlybigrams_skip5", + // // "test_tfidfvectorizer_tf_batch_uniandbigrams_skip5", + // // "test_tfidfvectorizer_tf_only_bigrams_skip0", + // // "test_tfidfvectorizer_tf_onlybigrams_levelempty", + // // "test_tfidfvectorizer_tf_onlybigrams_skip5", + // // "test_tfidfvectorizer_tf_uniandbigrams_skip5", + // "test_thresholdedrelu_default", + // "test_thresholdedrelu_example", + // "test_thresholdedrelu", + // "test_tile_precomputed", + // "test_tile", + // // "test_top_k_negative_axis", + // // "test_top_k_smallest", + // // "test_top_k", + // // "test_training_dropout_default_mask", + // // "test_training_dropout_default", + // // "test_training_dropout_mask", + // // "test_training_dropout_zero_ratio_mask", + // // "test_training_dropout_zero_ratio", + // // "test_training_dropout", + "test_transpose_all_permutations_0", + "test_transpose_all_permutations_1", + "test_transpose_all_permutations_2", + "test_transpose_all_permutations_3", + "test_transpose_all_permutations_4", + "test_transpose_all_permutations_5", + "test_transpose_default", + // "test_tril_neg", + // "test_tril_one_row_neg", + // "test_tril_out_neg", + // "test_tril_out_pos", + // "test_tril_pos", + // "test_tril_square_neg", + // "test_tril_square", + // "test_tril_zero", + // "test_tril", + // "test_triu_neg", + // "test_triu_one_row", + // "test_triu_out_neg_out", + // "test_triu_out_pos", + // "test_triu_pos", + // "test_triu_square_neg", + // "test_triu_square", + // "test_triu_zero", + // "test_triu", + // // "test_unique_not_sorted_without_axis", + // // "test_unique_sorted_with_axis_3d", + // // "test_unique_sorted_with_axis", + // // "test_unique_sorted_with_negative_axis", + // // "test_unique_sorted_without_axis", + "test_unsqueeze_axis_0", + "test_unsqueeze_axis_1", + "test_unsqueeze_axis_2", + "test_unsqueeze_axis_3", + "test_unsqueeze_negative_axes", + "test_unsqueeze_three_axes", + "test_unsqueeze_two_axes", + "test_unsqueeze_unsorted_axes", + "test_unsqueeze", + // "test_wrap_pad" + // "test_upsample_nearest", + "test_where_example" + // "test_where_long_example", + // "test_xor_bcast3v1d", + // "test_xor_bcast3v2d", + // "test_xor_bcast4v2d", + // "test_xor_bcast4v3d", + // "test_xor_bcast4v4d", + // "test_xor2d", + // "test_xor3d", + // "test_xor4d" ], "ops": [] } diff --git a/js/web/test/test-main.ts b/js/web/test/test-main.ts index 9bd0ec1425f9..96e374f87aed 100644 --- a/js/web/test/test-main.ts +++ b/js/web/test/test-main.ts @@ -19,49 +19,7 @@ if (ORT_WEB_TEST_CONFIG.model.some(testGroup => testGroup.tests.some(test => tes } // set flags -const options = ORT_WEB_TEST_CONFIG.options; -if (options.debug !== undefined) { - ort.env.debug = options.debug; -} -if (options.globalEnvFlags) { - const flags = 
options.globalEnvFlags; - if (flags.logLevel !== undefined) { - ort.env.logLevel = flags.logLevel; - } - if (flags.webgl?.contextId !== undefined) { - ort.env.webgl.contextId = flags.webgl.contextId; - } - if (flags.webgl?.matmulMaxBatchSize !== undefined) { - ort.env.webgl.matmulMaxBatchSize = flags.webgl.matmulMaxBatchSize; - } - if (flags.webgl?.textureCacheMode !== undefined) { - ort.env.webgl.textureCacheMode = flags.webgl.textureCacheMode; - } - if (flags.webgl?.pack !== undefined) { - ort.env.webgl.pack = flags.webgl.pack; - } - if (flags.webgl?.async !== undefined) { - ort.env.webgl.async = flags.webgl.async; - } - if (flags.wasm?.numThreads !== undefined) { - ort.env.wasm.numThreads = flags.wasm.numThreads; - } - if (flags.wasm?.simd !== undefined) { - ort.env.wasm.simd = flags.wasm.simd; - } - if (flags.wasm?.proxy !== undefined) { - ort.env.wasm.proxy = flags.wasm.proxy; - } - if (flags.wasm?.initTimeout !== undefined) { - ort.env.wasm.initTimeout = flags.wasm.initTimeout; - } - if (flags.webgpu?.profilingMode !== undefined) { - ort.env.webgpu.profiling = {mode: flags.webgpu.profilingMode}; - } - if (flags.webgpu?.validateInputContent !== undefined) { - ort.env.webgpu.validateInputContent = flags.webgpu.validateInputContent; - } -} +Object.assign(ort.env, ORT_WEB_TEST_CONFIG.options.globalEnvFlags); // Set logging configuration for (const logConfig of ORT_WEB_TEST_CONFIG.log) { @@ -110,8 +68,7 @@ for (const group of ORT_WEB_TEST_CONFIG.model) { let context: ModelTestContext; before('prepare session', async () => { - context = await ModelTestContext.create( - test, ORT_WEB_TEST_CONFIG.profile, ORT_WEB_TEST_CONFIG.options.sessionOptions); + context = await ModelTestContext.create(test, ORT_WEB_TEST_CONFIG.profile, ORT_WEB_TEST_CONFIG.options); }); after('release session', async () => { diff --git a/js/web/test/test-runner.ts b/js/web/test/test-runner.ts index 5e9b0910a2c6..d8ee5ef95320 100644 --- a/js/web/test/test-runner.ts +++ b/js/web/test/test-runner.ts @@ -39,10 +39,6 @@ const ONNXRUNTIME_THRESHOLD_RELATIVE_ERROR = 1.00001; */ const now = (typeof performance !== 'undefined' && performance.now) ? () => performance.now() : Date.now; -function toInternalTensor(tensor: ort.Tensor): Tensor { - return new Tensor( - tensor.dims, tensor.type as Tensor.DataType, undefined, undefined, tensor.data as Tensor.NumberType); -} function fromInternalTensor(tensor: Tensor): ort.Tensor { return new ort.Tensor(tensor.type, tensor.data as ort.Tensor.DataType, tensor.dims); } @@ -96,7 +92,7 @@ async function loadTensors( const outputs: Test.NamedTensor[] = []; let dataFileType: 'none'|'pb'|'npy' = 'none'; - const allowInt64 = ['wasm', 'xnnpack', 'webgpu'].includes(backendName); + const allowInt64 = ['wasm', 'webgpu', 'webnn'].includes(backendName); for (const dataFile of testCase.dataFiles) { const ext = extname(dataFile); @@ -137,7 +133,8 @@ async function loadTensors( } async function initializeSession( - modelFilePath: string, backendHint: string, ioBindingMode: Test.IOBindingMode, profile: boolean, + modelFilePath: string, backendHint: ort.InferenceSession.ExecutionProviderConfig, ioBindingMode: Test.IOBindingMode, + profile: boolean, externalData: ort.InferenceSession.SessionOptions['externalData'], sessionOptions: ort.InferenceSession.SessionOptions, fileCache?: FileCacheBuffer): Promise { const preloadModelData: Uint8Array|undefined = fileCache && fileCache[modelFilePath] ? 
fileCache[modelFilePath] : undefined; @@ -152,7 +149,8 @@ async function initializeSession( executionProviders: [backendHint], profiler: profilerConfig, enableProfiling: profile, - preferredOutputLocation: ioBindingMode === 'gpu-location' ? ('gpu-buffer' as const) : undefined + preferredOutputLocation: ioBindingMode === 'gpu-location' ? ('gpu-buffer' as const) : undefined, + externalData }; let session: ort.InferenceSession; @@ -161,7 +159,8 @@ async function initializeSession( if (preloadModelData) { session = await ort.InferenceSession.create(preloadModelData, sessionConfig); } else { - session = await ort.InferenceSession.create(modelFilePath, sessionConfig); + const modelData = await readFile(modelFilePath); + session = await ort.InferenceSession.create(modelData, sessionConfig); } } catch (e) { Logger.error( @@ -232,9 +231,8 @@ export class ModelTestContext { /** * create a ModelTestContext object that used in every test cases in the given ModelTest. */ - static async create( - modelTest: Test.ModelTest, profile: boolean, - sessionOptions?: ort.InferenceSession.SessionOptions): Promise { + static async create(modelTest: Test.ModelTest, profile: boolean, testOptions?: Test.Options): + Promise { if (this.initializing) { throw new Error('cannot create a ModelTestContext object when the previous creation is not done'); } @@ -243,8 +241,12 @@ export class ModelTestContext { this.initializing = true; const initStart = now(); + const executionProviderConfig = + modelTest.backend === 'webnn' ? (testOptions?.webnnOptions || 'webnn') : modelTest.backend!; const session = await initializeSession( - modelTest.modelUrl, modelTest.backend!, modelTest.ioBinding, profile, sessionOptions || {}, this.cache); + modelTest.modelUrl, executionProviderConfig, modelTest.ioBinding, profile, modelTest.externalData, + testOptions?.sessionOptions || {}, this.cache); + const initEnd = now(); for (const testCase of modelTest.cases) { @@ -313,7 +315,7 @@ export class TensorResultValidator { } else if (backend === 'webgpu') { this.absoluteThreshold = WEBGPU_THRESHOLD_ABSOLUTE_ERROR; this.relativeThreshold = WEBGPU_THRESHOLD_RELATIVE_ERROR; - } else if (backend === 'wasm' || backend === 'xnnpack' || backend === 'webnn') { + } else if (backend === 'wasm' || backend === 'webnn') { this.absoluteThreshold = WASM_THRESHOLD_ABSOLUTE_ERROR; this.relativeThreshold = WASM_THRESHOLD_RELATIVE_ERROR; } else if (backend === 'onnxruntime') { @@ -325,6 +327,10 @@ export class TensorResultValidator { } checkTensorResult(actual: Tensor[], expected: Tensor[]): void { + this.checkApiTensorResult(actual.map(fromInternalTensor), expected.map(fromInternalTensor)); + } + + checkApiTensorResult(actual: ort.Tensor[], expected: ort.Tensor[]): void { // check output size expect(actual.length, 'size of output tensors').to.equal(expected.length); @@ -342,10 +348,6 @@ export class TensorResultValidator { } } - checkApiTensorResult(actual: ort.Tensor[], expected: ort.Tensor[]): void { - this.checkTensorResult(actual.map(toInternalTensor), expected.map(toInternalTensor)); - } - checkNamedTensorResult(actual: Record, expected: Test.NamedTensor[]): void { // check output size expect(Object.getOwnPropertyNames(actual).length, 'size of output tensors').to.equal(expected.length); @@ -359,7 +361,7 @@ export class TensorResultValidator { } // This function check whether 2 tensors should be considered as 'match' or not - areEqual(actual: Tensor, expected: Tensor): boolean { + areEqual(actual: ort.Tensor, expected: ort.Tensor): boolean { if (!actual || 
!expected) { return false; } @@ -387,13 +389,13 @@ export class TensorResultValidator { switch (actualType) { case 'string': - return this.strictEqual(actual.stringData, expected.stringData); + return this.strictEqual(actual.data, expected.data); case 'float32': case 'float64': return this.floatEqual( - actual.numberData as number[] | Float32Array | Float64Array, - expected.numberData as number[] | Float32Array | Float64Array); + actual.data as number[] | Float32Array | Float64Array, + expected.data as number[] | Float32Array | Float64Array); case 'uint8': case 'int8': @@ -404,10 +406,8 @@ export class TensorResultValidator { case 'int64': case 'bool': return TensorResultValidator.integerEqual( - actual.numberData as number[] | Uint8Array | Int8Array | Uint16Array | Int16Array | Uint32Array | - Int32Array, - expected.numberData as number[] | Uint8Array | Int8Array | Uint16Array | Int16Array | Uint32Array | - Int32Array); + actual.data as number[] | Uint8Array | Int8Array | Uint16Array | Int16Array | Uint32Array | Int32Array, + expected.data as number[] | Uint8Array | Int8Array | Uint16Array | Int16Array | Uint32Array | Int32Array); default: throw new Error('type not implemented or not supported'); @@ -574,7 +574,9 @@ export async function sessionRun(options: { // replace the CPU tensors in feeds into GPU tensors for (const name in feeds) { if (Object.hasOwnProperty.call(feeds, name)) { - feeds[name] = createGpuTensorForInput(feeds[name]); + if (feeds[name].size > 0) { + feeds[name] = createGpuTensorForInput(feeds[name]); + } } } } @@ -583,7 +585,11 @@ export async function sessionRun(options: { for (const name in options.outputsMetaInfo) { if (Object.hasOwnProperty.call(options.outputsMetaInfo, name)) { const {type, dims} = options.outputsMetaInfo[name]; - fetches[name] = createGpuTensorForOutput(type, dims); + if (dims.some(d => d === 0)) { + fetches[name] = new ort.Tensor(type, [], dims); + } else { + fetches[name] = createGpuTensorForOutput(type, dims); + } } } } @@ -628,8 +634,8 @@ export async function runModelTestSet( try { const feeds: Record = {}; const outputsMetaInfo: Record = {}; - testCase.inputs!.forEach((tensor, i) => feeds[context.session.inputNames[i]] = tensor); - testCase.outputs!.forEach((tensor, i) => outputsMetaInfo[context.session.outputNames[i]] = tensor); + testCase.inputs!.forEach((tensor) => feeds[tensor.name] = tensor); + testCase.outputs!.forEach((tensor) => outputsMetaInfo[tensor.name] = tensor); const [start, end, outputs] = await sessionRun({session: context.session, feeds, outputsMetaInfo, ioBinding: context.ioBinding}); if (context.perfData.count === 0) { diff --git a/js/web/test/test-shared.ts b/js/web/test/test-shared.ts index 7c327e7c97ac..55beb66e37e6 100644 --- a/js/web/test/test-shared.ts +++ b/js/web/test/test-shared.ts @@ -15,14 +15,33 @@ export function bufferToBase64(buffer: Uint8Array): string { return base64.fromByteArray(buffer); } +async function retry(fn: () => Promise, maxRetries = 3, delay = 100): Promise { + let retries = maxRetries; + do { + try { + return await fn(); + } catch (err) { + if (retries-- === 0) { + throw err; + } + await new Promise(resolve => setTimeout(resolve, delay)); + } + // eslint-disable-next-line no-constant-condition + } while (true); +} + export async function readFile(file: string) { if (typeof process !== 'undefined' && process.versions && process.versions.node) { // node return fs.readFile(file); } else { // browser - const response = await fetch(file); - return new Uint8Array(await response.arrayBuffer()); + 
// + // use "retry" to workaround the error "TypeError: Failed to fetch" in some test environments + return retry(async () => { + const response = await fetch(file); + return new Uint8Array(await response.arrayBuffer()); + }); } } diff --git a/js/web/test/test-types.ts b/js/web/test/test-types.ts index 5bdc8d84cc7a..14b9fd7c005a 100644 --- a/js/web/test/test-types.ts +++ b/js/web/test/test-types.ts @@ -65,6 +65,7 @@ export declare namespace Test { export interface ModelTest { name: string; modelUrl: string; + externalData?: InferenceSession.SessionOptions['externalData']; backend?: string; // value should be populated at build time ioBinding: IOBindingMode; platformCondition?: PlatformCondition; @@ -143,6 +144,7 @@ export declare namespace Test { cudaFlags?: Record; wasmOptions?: InferenceSession.WebAssemblyExecutionProviderOption; webglOptions?: InferenceSession.WebGLExecutionProviderOption; + webnnOptions?: InferenceSession.WebNNExecutionProviderOption; globalEnvFlags?: EnvOptions; } diff --git a/js/web/test/training/e2e/browser-test-wasm.js b/js/web/test/training/e2e/browser-test-wasm.js new file mode 100644 index 000000000000..fa87389f7ac4 --- /dev/null +++ b/js/web/test/training/e2e/browser-test-wasm.js @@ -0,0 +1,21 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +'use strict'; + +describe('Browser E2E testing for training package', function() { + it('Check that training package encompasses inference', async function() { + ort.env.wasm.numThreads = 1; + await testInferenceFunction(ort, {executionProviders: ['wasm']}); + }); + + it('Check training functionality, all options', async function() { + ort.env.wasm.numThreads = 1; + await testTrainingFunctionAll(ort, {executionProviders: ['wasm']}); + }); + + it('Check training functionality, minimum options', async function() { + ort.env.wasm.numThreads = 1; + await testTrainingFunctionMin(ort, {executionProviders: ['wasm']}); + }); +}); diff --git a/js/web/test/training/e2e/common.js b/js/web/test/training/e2e/common.js new file mode 100644 index 000000000000..b6040b63d56b --- /dev/null +++ b/js/web/test/training/e2e/common.js @@ -0,0 +1,246 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
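As an aside on the `externalData` plumbing above: the field declared on `ModelTest` is threaded through `initializeSession` into the session options, so a test model can reference weight files stored outside the .onnx file. A minimal sketch of the resulting call, assuming hypothetical asset names and my reading of the `InferenceSession.SessionOptions['externalData']` shape (each entry pairs the path referenced inside the model with the location to load it from):

```ts
import * as ort from 'onnxruntime-web';

// Sketch only: 'model.onnx' and 'weights.bin' are hypothetical test assets.
async function createSessionWithExternalData(): Promise<ort.InferenceSession> {
  return ort.InferenceSession.create('model.onnx', {
    executionProviders: ['wasm'],
    // 'path' is the reference recorded inside the model; 'data' is where to fetch it from.
    externalData: [{path: 'weights.bin', data: './weights.bin'}],
  });
}
```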
+ +'use strict'; + +const DATA_FOLDER = 'data/'; +const TRAININGDATA_TRAIN_MODEL = DATA_FOLDER + 'training_model.onnx'; +const TRAININGDATA_OPTIMIZER_MODEL = DATA_FOLDER + 'adamw.onnx'; +const TRAININGDATA_EVAL_MODEL = DATA_FOLDER + 'eval_model.onnx'; +const TRAININGDATA_CKPT = DATA_FOLDER + 'checkpoint.ckpt'; + +const trainingSessionAllOptions = { + checkpointState: TRAININGDATA_CKPT, + trainModel: TRAININGDATA_TRAIN_MODEL, + evalModel: TRAININGDATA_EVAL_MODEL, + optimizerModel: TRAININGDATA_OPTIMIZER_MODEL +} + +const trainingSessionMinOptions = { + checkpointState: TRAININGDATA_CKPT, + trainModel: TRAININGDATA_TRAIN_MODEL, +} + +// ASSERT METHODS + +function assert(cond) { + if (!cond) throw new Error(); +} + +function assertStrictEquals(actual, expected) { + if (actual !== expected) { + let strRep = actual; + if (typeof actual === 'object') { + strRep = JSON.stringify(actual); + } + throw new Error(`expected: ${expected}; got: ${strRep}`); + } +} + +function assertTwoListsUnequal(list1, list2) { + if (list1.length !== list2.length) { + return; + } + for (let i = 0; i < list1.length; i++) { + if (list1[i] !== list2[i]) { + return; + } + } + throw new Error(`expected ${list1} and ${list2} to be unequal; got two equal lists`); +} + +// HELPER METHODS FOR TESTS + +function generateGaussianRandom(mean=0, scale=1) { + const u = 1 - Math.random(); + const v = Math.random(); + const z = Math.sqrt(-2.0 * Math.log(u)) * Math.cos(2.0 * Math.PI * v); + return z * scale + mean; +} + +function generateGaussianFloatArray(length) { + const array = new Float32Array(length); + + for (let i = 0; i < length; i++) { + array[i] = generateGaussianRandom(); + } + + return array; +} + +/** + * creates the TrainingSession and verifies that the input and output names of the training model loaded into the + * training session are correct. + * @param {} ort + * @param {*} createOptions + * @param {*} options + * @returns + */ +async function createTrainingSessionAndCheckTrainingModel(ort, createOptions, options) { + const trainingSession = await ort.TrainingSession.create(createOptions, options); + + assertStrictEquals(trainingSession.trainingInputNames[0], 'input-0'); + assertStrictEquals(trainingSession.trainingInputNames[1], 'labels'); + assertStrictEquals(trainingSession.trainingInputNames.length, 2); + assertStrictEquals(trainingSession.trainingOutputNames[0], 'onnx::loss::21273'); + assertStrictEquals(trainingSession.trainingOutputNames.length, 1); + return trainingSession; +} + +/** + * verifies that the eval input and output names associated with the eval model loaded into the given training session + * are correct. 
+ */ +function checkEvalModel(trainingSession) { + assertStrictEquals(trainingSession.evalInputNames[0], 'input-0'); + assertStrictEquals(trainingSession.evalInputNames[1], 'labels'); + assertStrictEquals(trainingSession.evalInputNames.length, 2); + assertStrictEquals(trainingSession.evalOutputNames[0], 'onnx::loss::21273'); + assertStrictEquals(trainingSession.evalOutputNames.length, 1); +} + +/** + * Checks that accessing trainingSession.evalInputNames or trainingSession.evalOutputNames will throw an error if + * accessed + * @param {} trainingSession + */ +function checkNoEvalModel(trainingSession) { + try { + assertStrictEquals(trainingSession.evalInputNames, "should have thrown an error upon accessing"); + } catch (error) { + assertStrictEquals(error.message, 'This training session has no evalModel loaded.'); + } + try { + assertStrictEquals(trainingSession.evalOutputNames, "should have thrown an error upon accessing"); + } catch (error) { + assertStrictEquals(error.message, 'This training session has no evalModel loaded.'); + } +} + +/** + * runs the train step with the given inputs and checks that the tensor returned is of type float32 and has a length + * of 1 for the loss. + * @param {} trainingSession + * @param {*} feeds + * @returns + */ +var runTrainStepAndCheck = async function(trainingSession, feeds) { + const results = await trainingSession.runTrainStep(feeds); + assertStrictEquals(Object.keys(results).length, 1); + assertStrictEquals(results['onnx::loss::21273'].data.length, 1); + assertStrictEquals(results['onnx::loss::21273'].type, 'float32'); + return results; +}; + +var loadParametersBufferAndCheck = async function(trainingSession, paramsLength, constant, paramsBefore) { + // make a float32 array that is filled with the constant + const newParams = new Float32Array(paramsLength); + for (let i = 0; i < paramsLength; i++) { + newParams[i] = constant; + } + + const newParamsUint8 = new Uint8Array(newParams.buffer, newParams.byteOffset, newParams.byteLength); + + await trainingSession.loadParametersBuffer(newParamsUint8); + const paramsAfterLoad = await trainingSession.getContiguousParameters(); + + // check that the parameters have changed + assertTwoListsUnequal(paramsAfterLoad.data, paramsBefore.data); + assertStrictEquals(paramsAfterLoad.dims[0], paramsLength); + + // check that the parameters have changed to what they should be + for (let i = 0; i < paramsLength; i++) { + // round to the same number of digits (4 decimal places) + assertStrictEquals(paramsAfterLoad.data[i].toFixed(4), constant.toFixed(4)); + } + + return paramsAfterLoad; +} + +// TESTS + +var testInferenceFunction = async function(ort, options) { + const session = await ort.InferenceSession.create('data/model.onnx', options || {}); + + const dataA = Float32Array.from([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]); + const dataB = Float32Array.from([10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120]); + + const fetches = + await session.run({a: new ort.Tensor('float32', dataA, [3, 4]), b: new ort.Tensor('float32', dataB, [4, 3])}); + + const c = fetches.c; + + assert(c instanceof ort.Tensor); + assert(c.dims.length === 2 && c.dims[0] === 3 && c.dims[1] === 3); + assert(c.data[0] === 700); + assert(c.data[1] === 800); + assert(c.data[2] === 900); + assert(c.data[3] === 1580); + assert(c.data[4] === 1840); + assert(c.data[5] === 2100); + assert(c.data[6] === 2460); + assert(c.data[7] === 2880); + assert(c.data[8] === 3300); +}; + +var testTrainingFunctionMin = async function(ort, options) { + const 
trainingSession = await createTrainingSessionAndCheckTrainingModel(ort, trainingSessionMinOptions, options); + checkNoEvalModel(trainingSession); + const input0 = new ort.Tensor('float32', generateGaussianFloatArray(2 * 784), [2, 784]); + const labels = new ort.Tensor('int32', [2, 1], [2]); + const feeds = {"input-0": input0, "labels": labels}; + + // check getParametersSize + const paramsSize = await trainingSession.getParametersSize(); + assertStrictEquals(paramsSize, 397510); + + // check getContiguousParameters + const originalParams = await trainingSession.getContiguousParameters(); + assertStrictEquals(originalParams.dims.length, 1); + assertStrictEquals(originalParams.dims[0], 397510); + assertStrictEquals(originalParams.data[0], -0.025190064683556557); + assertStrictEquals(originalParams.data[2000], -0.034044936299324036); + + await runTrainStepAndCheck(trainingSession, feeds); + + await loadParametersBufferAndCheck(trainingSession, 397510, -1.2, originalParams); +} + +var testTrainingFunctionAll = async function(ort, options) { + const trainingSession = await createTrainingSessionAndCheckTrainingModel(ort, trainingSessionAllOptions, options); + checkEvalModel(trainingSession); + + const input0 = new ort.Tensor('float32', generateGaussianFloatArray(2 * 784), [2, 784]); + const labels = new ort.Tensor('int32', [2, 1], [2]); + let feeds = {"input-0": input0, "labels": labels}; + + // check getParametersSize + const paramsSize = await trainingSession.getParametersSize(); + assertStrictEquals(paramsSize, 397510); + + // check getContiguousParameters + const originalParams = await trainingSession.getContiguousParameters(); + assertStrictEquals(originalParams.dims.length, 1); + assertStrictEquals(originalParams.dims[0], 397510); + assertStrictEquals(originalParams.data[0], -0.025190064683556557); + assertStrictEquals(originalParams.data[2000], -0.034044936299324036); + + const results = await runTrainStepAndCheck(trainingSession, feeds); + + await trainingSession.runOptimizerStep(feeds); + feeds = {"input-0": input0, "labels": labels}; + // check getContiguousParameters after optimizerStep -- that the parameters have been updated + const optimizedParams = await trainingSession.getContiguousParameters(); + assertTwoListsUnequal(originalParams.data, optimizedParams.data); + + const results2 = await runTrainStepAndCheck(trainingSession, feeds); + + // check that loss decreased after optimizer step and training again + assert(results2['onnx::loss::21273'].data[0] < results['onnx::loss::21273'].data[0]); + + await loadParametersBufferAndCheck(trainingSession, 397510, -1.2, optimizedParams); +} + +if (typeof module === 'object') { + module.exports = [testInferenceFunction, testTrainingFunctionMin, testTrainingFunctionAll]; +} diff --git a/js/web/test/training/e2e/data/model.onnx b/js/web/test/training/e2e/data/model.onnx new file mode 100644 index 000000000000..088124bd4862 --- /dev/null +++ b/js/web/test/training/e2e/data/model.onnx @@ -0,0 +1,16 @@ + backend-test:b + +a +bc"MatMultest_matmul_2dZ +a +   + +Z +b +   + +b +c +   + +B \ No newline at end of file diff --git a/js/web/test/training/e2e/karma.conf.js b/js/web/test/training/e2e/karma.conf.js new file mode 100644 index 000000000000..e441cb65b412 --- /dev/null +++ b/js/web/test/training/e2e/karma.conf.js @@ -0,0 +1,54 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License.
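Taken together, the two test functions above walk the whole `TrainingSession` API in a fixed order: create, run a train step, run the optimizer, inspect parameters. Condensed into a sketch that uses only the calls and names appearing above (the `generateGaussianFloatArray` helper and the `onnx::loss::21273` output name come from this test file):

```ts
// Sketch of the train/optimize cycle exercised by the tests above.
// Assumes an `ort` build that exposes the training API (ort.training.wasm).
async function trainSteps(ort: any, steps: number): Promise<void> {
  const session = await ort.TrainingSession.create({
    checkpointState: 'data/checkpoint.ckpt',
    trainModel: 'data/training_model.onnx',
    optimizerModel: 'data/adamw.onnx',
  });
  for (let i = 0; i < steps; i++) {
    const feeds = {
      'input-0': new ort.Tensor('float32', generateGaussianFloatArray(2 * 784), [2, 784]),
      labels: new ort.Tensor('int32', [2, 1], [2]),
    };
    const results = await session.runTrainStep(feeds);  // forward + backward
    await session.runOptimizerStep(feeds);              // AdamW parameter update
    console.log(`step ${i} loss: ${results['onnx::loss::21273'].data[0]}`);
  }
}
```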
+ +'use strict'; + +const args = require('minimist')(process.argv.slice(2)); +const SELF_HOST = !!args['self-host']; +const ORT_MAIN = args['ort-main']; +const TEST_MAIN = args['test-main']; +if (typeof TEST_MAIN !== 'string') { + throw new Error('flag --test-main= is required'); +} +const USER_DATA = args['user-data']; +if (typeof USER_DATA !== 'string') { + throw new Error('flag --user-data= is required'); +} + +module.exports = function(config) { + const distPrefix = SELF_HOST ? './node_modules/onnxruntime-web/dist/' : 'http://localhost:8081/dist/'; + config.set({ + frameworks: ['mocha'], + files: [ + {pattern: distPrefix + ORT_MAIN}, + {pattern: './common.js'}, + {pattern: TEST_MAIN}, + {pattern: './node_modules/onnxruntime-web/dist/*.wasm', included: false, nocache: true}, + {pattern: './data/*', included: false}, + ], + plugins: [require('@chiragrupani/karma-chromium-edge-launcher'), ...config.plugins], + proxies: { + '/model.onnx': '/base/model.onnx', + '/data/': '/base/data/', + }, + client: {captureConsole: true, mocha: {expose: ['body'], timeout: 60000}}, + reporters: ['mocha'], + captureTimeout: 120000, + reportSlowerThan: 100, + browserDisconnectTimeout: 600000, + browserNoActivityTimeout: 300000, + browserDisconnectTolerance: 0, + browserSocketTimeout: 60000, + hostname: 'localhost', + browsers: [], + customLaunchers: { + Chrome_default: {base: 'ChromeHeadless', chromeDataDir: USER_DATA}, + Chrome_no_threads: { + base: 'ChromeHeadless', + chromeDataDir: USER_DATA, + // TODO: no-thread flags + }, + Edge_default: {base: 'Edge', edgeDataDir: USER_DATA} + } + }); +}; diff --git a/js/web/test/training/e2e/package.json b/js/web/test/training/e2e/package.json new file mode 100644 index 000000000000..5f11a27de6df --- /dev/null +++ b/js/web/test/training/e2e/package.json @@ -0,0 +1,14 @@ +{ + "devDependencies": { + "@chiragrupani/karma-chromium-edge-launcher": "^2.2.2", + "fs-extra": "^11.1.0", + "globby": "^13.1.3", + "karma": "^6.4.1", + "karma-chrome-launcher": "^3.1.1", + "karma-mocha": "^2.0.1", + "karma-mocha-reporter": "^2.2.5", + "light-server": "^2.9.1", + "minimist": "^1.2.7", + "mocha": "^10.2.0" + } +} diff --git a/js/web/test/training/e2e/run.js b/js/web/test/training/e2e/run.js new file mode 100644 index 000000000000..379a8136f3ff --- /dev/null +++ b/js/web/test/training/e2e/run.js @@ -0,0 +1,138 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +'use strict'; + +const path = require('path'); +const fs = require('fs-extra'); +const {spawn} = require('child_process'); +const startServer = require('./simple-http-server'); +const minimist = require('minimist'); + +// copy the whole folder to outside of /js/ because we need to test in a folder whose parent folders contain +// no `package.json` file.
+// here we use /build/js/e2e-training/ for the test + +const TEST_E2E_SRC_FOLDER = __dirname; +const JS_ROOT_FOLDER = path.resolve(__dirname, '../../../..'); +const TEST_E2E_RUN_FOLDER = path.resolve(JS_ROOT_FOLDER, '../build/js/e2e-training'); +const NPM_CACHE_FOLDER = path.resolve(TEST_E2E_RUN_FOLDER, '../npm_cache'); +const CHROME_USER_DATA_FOLDER = path.resolve(TEST_E2E_RUN_FOLDER, '../user_data'); +fs.emptyDirSync(TEST_E2E_RUN_FOLDER); +fs.emptyDirSync(NPM_CACHE_FOLDER); +fs.emptyDirSync(CHROME_USER_DATA_FOLDER); +fs.copySync(TEST_E2E_SRC_FOLDER, TEST_E2E_RUN_FOLDER); + +// training data to copy +const ORT_ROOT_FOLDER = path.resolve(JS_ROOT_FOLDER, '..'); +const TRAINING_DATA_FOLDER = path.resolve(ORT_ROOT_FOLDER, 'onnxruntime/test/testdata/training_api'); +const TRAININGDATA_DEST = path.resolve(TEST_E2E_RUN_FOLDER, 'data'); + +// always use a new folder as user-data-dir +let nextUserDataDirId = 0; +function getNextUserDataDir() { + const dir = path.resolve(CHROME_USER_DATA_FOLDER, nextUserDataDirId.toString()) + nextUserDataDirId++; + fs.emptyDirSync(dir); + return dir; +} + +// commandline arguments +const BROWSER = minimist(process.argv.slice(2)).browser || 'Chrome_default'; + +async function main() { + // find packed package + const {globbySync} = await import('globby'); + + const ORT_COMMON_FOLDER = path.resolve(JS_ROOT_FOLDER, 'common'); + const ORT_COMMON_PACKED_FILEPATH_CANDIDATES = globbySync('onnxruntime-common-*.tgz', {cwd: ORT_COMMON_FOLDER}); + + const PACKAGES_TO_INSTALL = []; + + if (ORT_COMMON_PACKED_FILEPATH_CANDIDATES.length === 1) { + PACKAGES_TO_INSTALL.push(path.resolve(ORT_COMMON_FOLDER, ORT_COMMON_PACKED_FILEPATH_CANDIDATES[0])); + } else if (ORT_COMMON_PACKED_FILEPATH_CANDIDATES.length > 1) { + throw new Error('multiple packages found for onnxruntime-common.'); + } + + const ORT_WEB_FOLDER = path.resolve(JS_ROOT_FOLDER, 'web'); + const ORT_WEB_PACKED_FILEPATH_CANDIDATES = globbySync('onnxruntime-web-*.tgz', {cwd: ORT_WEB_FOLDER}); + if (ORT_WEB_PACKED_FILEPATH_CANDIDATES.length !== 1) { + throw new Error('cannot find exactly single package for onnxruntime-web.'); + } + PACKAGES_TO_INSTALL.push(path.resolve(ORT_WEB_FOLDER, ORT_WEB_PACKED_FILEPATH_CANDIDATES[0])); + + // we start here: + + // install dev dependencies + await runInShell(`npm install`); + + // npm install with "--cache" to install packed packages with an empty cache folder + await runInShell(`npm install --cache "${NPM_CACHE_FOLDER}" ${PACKAGES_TO_INSTALL.map(i => `"${i}"`).join(' ')}`); + + // prepare training data + prepareTrainingDataByCopying(); + + console.log('==============================================================='); + console.log("Running self-hosted tests"); + console.log('==============================================================='); + // test cases with self-host (ort hosted in same origin) + await testAllBrowserCases({hostInKarma: true}); + + console.log('==============================================================='); + console.log("Running non-self-hosted tests"); + console.log('==============================================================='); + // test cases without self-host (ort hosted in a different origin) + startServer(path.resolve(TEST_E2E_RUN_FOLDER, 'node_modules', 'onnxruntime-web')); + await testAllBrowserCases({hostInKarma: false}); + + // no error occurs, exit with code 0 + process.exit(0); +} + +async function testAllBrowserCases({hostInKarma}) { + await runKarma({hostInKarma, main: './browser-test-wasm.js'}); +} + +async function runKarma({hostInKarma, main,
browser = BROWSER, ortMain = 'ort.training.wasm.min.js'}) { + console.log('==============================================================='); + console.log(`Running karma with the following binary: ${ortMain}`); + console.log('==============================================================='); + const selfHostFlag = hostInKarma ? '--self-host' : ''; + await runInShell(`npx karma start --single-run --browsers ${browser} ${selfHostFlag} --ort-main=${ + ortMain} --test-main=${main} --user-data=${getNextUserDataDir()}`); +} + +async function runInShell(cmd) { + console.log('==============================================================='); + console.log(' Running command in shell:'); + console.log(' > ' + cmd); + console.log('==============================================================='); + let complete = false; + const childProcess = spawn(cmd, {shell: true, stdio: 'inherit', cwd: TEST_E2E_RUN_FOLDER}); + childProcess.on('close', function(code) { + if (code !== 0) { + process.exit(code); + } else { + complete = true; + } + }); + while (!complete) { + await delay(100); + } +} + +async function delay(ms) { + return new Promise(function(resolve) { + setTimeout(function() { + resolve(); + }, ms); + }); +} + +function prepareTrainingDataByCopying() { + fs.copySync(TRAINING_DATA_FOLDER, TRAININGDATA_DEST); + console.log(`Copied ${TRAINING_DATA_FOLDER} to ${TRAININGDATA_DEST}`); +} + +main(); diff --git a/js/web/test/training/e2e/simple-http-server.js b/js/web/test/training/e2e/simple-http-server.js new file mode 100644 index 000000000000..a157c7dd93ad --- /dev/null +++ b/js/web/test/training/e2e/simple-http-server.js @@ -0,0 +1,64 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +'use strict'; + +// this is a simple HTTP server that enables CORS. 
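A small design note on `runInShell` above: it busy-waits on a `complete` flag with `delay(100)` until the child process closes. The same behavior can be expressed without polling by wrapping the process in a Promise; a sketch of that alternative (not what run.js does):

```ts
import {spawn} from 'child_process';

// Resolve when the shell command exits cleanly; reject on a non-zero exit code.
function runInShellAwaitable(cmd: string, cwd: string): Promise<void> {
  return new Promise((resolve, reject) => {
    const child = spawn(cmd, {shell: true, stdio: 'inherit', cwd});
    child.on('close', (code) => {
      if (code === 0) {
        resolve();
      } else {
        reject(new Error(`command exited with code ${code}`));
      }
    });
  });
}
```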
+// following code is based on https://developer.mozilla.org/en-US/docs/Learn/Server-side/Node_server_without_framework + +const http = require('http'); +const fs = require('fs'); +const path = require('path'); + +const validRequests = { + // .wasm files + '/dist/ort-wasm.wasm': ['dist/ort-wasm.wasm', 'application/wasm'], + '/dist/ort-wasm-simd.wasm': ['dist/ort-wasm-simd.wasm', 'application/wasm'], + '/dist/ort-training-wasm-simd.wasm': ['dist/ort-training-wasm-simd.wasm', 'application/wasm'], + '/dist/ort-wasm-threaded.wasm': ['dist/ort-wasm-threaded.wasm', 'application/wasm'], + '/dist/ort-wasm-simd-threaded.wasm': ['dist/ort-wasm-simd-threaded.wasm', 'application/wasm'], + + // proxied .wasm files: + '/test-wasm-path-override/ort-wasm.wasm': ['dist/ort-training-wasm.wasm', 'application/wasm'], + //'/test-wasm-path-override/renamed.wasm': ['dist/ort-wasm.wasm', 'application/wasm'], + + // .js files + '/dist/ort.min.js': ['dist/ort.min.js', 'text/javascript'], + '/dist/ort.training.simd.wasm.min.js': ['dist/ort.training.simd.wasm.min.js', 'text/javascript'], + '/dist/ort.training.wasm.min.js': ['dist/ort.training.wasm.min.js', 'text/javascript'], + '/dist/ort.js': ['dist/ort.js', 'text/javascript'], + '/dist/ort.webgl.min.js': ['dist/ort.webgl.min.js', 'text/javascript'], + '/dist/ort.wasm.min.js': ['dist/ort.wasm.min.js', 'text/javascript'], + '/dist/ort.wasm-core.min.js': ['dist/ort.wasm-core.min.js', 'text/javascript'], +}; + +module.exports = function(dir) { + http.createServer(function(request, response) { + console.log(`request ${request.url.replace(/\n|\r/g, '')}`); + + const requestData = validRequests[request.url]; + if (!requestData) { + response.writeHead(404); + response.end('404'); + } else { + const [filePath, contentType] = requestData; + fs.readFile(path.resolve(dir, filePath), function(error, content) { + if (error) { + if (error.code == 'ENOENT') { + response.writeHead(404); + response.end('404'); + } else { + response.writeHead(500); + response.end('500'); + } + } else { + response.setHeader('access-control-allow-origin', '*'); + response.writeHead(200, {'Content-Type': contentType}); + response.end(content, 'utf-8'); + } + }); + } + }) + .listen(8081); + console.log('Server running at http://127.0.0.1:8081/'); + }; diff --git a/js/web/test/unittests/backends/webgl/test-conv-new.ts b/js/web/test/unittests/backends/webgl/test-conv-new.ts index 8c186b9b3645..014fc57f2155 100644 --- a/js/web/test/unittests/backends/webgl/test-conv-new.ts +++ b/js/web/test/unittests/backends/webgl/test-conv-new.ts @@ -893,7 +893,9 @@ describe('New Conv tests', () => { const expected = cpuConv( inputTensor, kernelTensor, biasTensor, testData.autoPad, testData.dilations, testData.pads, testData.strides); - if (!validator.areEqual(actual, expected)) { + try { + validator.checkTensorResult([actual], [expected]); + } catch { console.log(actual.dims, `[${actual.numberData.slice(0, 20).join(',')},...]`); console.log(expected.dims, `[${expected.numberData.slice(0, 20).join(',')},...]`); throw new Error('Expected and Actual did not match'); diff --git a/objectivec/include/ort_coreml_execution_provider.h b/objectivec/include/ort_coreml_execution_provider.h index a015b6fd60c8..6ff18176ebeb 100644 --- a/objectivec/include/ort_coreml_execution_provider.h +++ b/objectivec/include/ort_coreml_execution_provider.h @@ -41,6 +41,17 @@ NS_ASSUME_NONNULL_BEGIN */ @property BOOL onlyEnableForDevicesWithANE; +/** + * Only allow CoreML EP to take nodes with inputs with static shapes.
By default it will also allow inputs with + * dynamic shapes. However, the performance may be negatively impacted if inputs have dynamic shapes. + */ +@property BOOL onlyAllowStaticInputShapes; + +/** + * Create an MLProgram. By default it will create a NeuralNetwork model. Requires Core ML 5 or later. + */ +@property BOOL createMLProgram; + @end @interface ORTSessionOptions (ORTSessionOptionsCoreMLEP) diff --git a/objectivec/ort_coreml_execution_provider.mm b/objectivec/ort_coreml_execution_provider.mm index 6340fdea1c3a..58b47d68eea6 100644 --- a/objectivec/ort_coreml_execution_provider.mm +++ b/objectivec/ort_coreml_execution_provider.mm @@ -26,7 +26,10 @@ - (BOOL)appendCoreMLExecutionProviderWithOptions:(ORTCoreMLExecutionProviderOpti const uint32_t flags = (options.useCPUOnly ? COREML_FLAG_USE_CPU_ONLY : 0) | (options.enableOnSubgraphs ? COREML_FLAG_ENABLE_ON_SUBGRAPH : 0) | - (options.onlyEnableForDevicesWithANE ? COREML_FLAG_ONLY_ENABLE_DEVICE_WITH_ANE : 0); + (options.onlyEnableForDevicesWithANE ? COREML_FLAG_ONLY_ENABLE_DEVICE_WITH_ANE : 0) | + (options.onlyAllowStaticInputShapes ? COREML_FLAG_ONLY_ALLOW_STATIC_INPUT_SHAPES : 0) | + (options.createMLProgram ? COREML_FLAG_CREATE_MLPROGRAM : 0); + Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_CoreML( [self CXXAPIOrtSessionOptions], flags)); return YES; diff --git a/objectivec/ort_value.mm b/objectivec/ort_value.mm index b9dc1a9885c6..c61a7ea80923 100644 --- a/objectivec/ort_value.mm +++ b/objectivec/ort_value.mm @@ -148,6 +148,9 @@ - (nullable ORTValueTypeInfo*)typeInfoWithError:(NSError**)error { - (nullable ORTTensorTypeAndShapeInfo*)tensorTypeAndShapeInfoWithError:(NSError**)error { try { const auto tensorTypeAndShapeInfo = _typeInfo->GetTensorTypeAndShapeInfo(); + if (!tensorTypeAndShapeInfo) { + ORT_CXX_API_THROW("ORTValue is not a tensor.", ORT_RUNTIME_EXCEPTION); + } return CXXAPIToPublicTensorTypeAndShapeInfo(tensorTypeAndShapeInfo); } ORT_OBJC_API_IMPL_CATCH_RETURNING_NULLABLE(error) @@ -156,6 +159,9 @@ - (nullable ORTTensorTypeAndShapeInfo*)tensorTypeAndShapeInfoWithError:(NSError* - (nullable NSMutableData*)tensorDataWithError:(NSError**)error { try { const auto tensorTypeAndShapeInfo = _typeInfo->GetTensorTypeAndShapeInfo(); + if (!tensorTypeAndShapeInfo) { + ORT_CXX_API_THROW("ORTValue is not a tensor.", ORT_RUNTIME_EXCEPTION); + } if (tensorTypeAndShapeInfo.GetElementType() == ONNX_TENSOR_ELEMENT_DATA_TYPE_STRING) { ORT_CXX_API_THROW( "This ORTValue holds string data. Please call tensorStringDataWithError: " @@ -182,6 +188,9 @@ - (nullable NSMutableData*)tensorDataWithError:(NSError**)error { - (nullable NSArray*)tensorStringDataWithError:(NSError**)error { try { const auto tensorTypeAndShapeInfo = _typeInfo->GetTensorTypeAndShapeInfo(); + if (!tensorTypeAndShapeInfo) { + ORT_CXX_API_THROW("ORTValue is not a tensor.", ORT_RUNTIME_EXCEPTION); + } const size_t elementCount = tensorTypeAndShapeInfo.GetElementCount(); const size_t tensorStringDataLength = _value->GetStringTensorDataLength(); std::vector tensorStringData(tensorStringDataLength, '\0'); diff --git a/onnxruntime/__init__.py b/onnxruntime/__init__.py index 57219c50f39a..c3699f0fb33a 100644 --- a/onnxruntime/__init__.py +++ b/onnxruntime/__init__.py @@ -7,7 +7,7 @@ For more information on ONNX Runtime, please see `aka.ms/onnxruntime `_ or the `Github project `_. 
""" -__version__ = "1.17.0" +__version__ = "1.18.0" __author__ = "Microsoft" # we need to do device version validation (for example to check Cuda version for an onnxruntime-training package). diff --git a/onnxruntime/contrib_ops/cpu/activations.cc b/onnxruntime/contrib_ops/cpu/activations.cc index 556699192d2e..3e0533dd8b9e 100644 --- a/onnxruntime/contrib_ops/cpu/activations.cc +++ b/onnxruntime/contrib_ops/cpu/activations.cc @@ -2,7 +2,7 @@ // Licensed under the MIT License. #include "core/providers/cpu/activation/activations.h" -#include "activations.h" +#include "contrib_ops/cpu/activations.h" namespace onnxruntime { namespace contrib { @@ -26,14 +26,6 @@ ONNX_CPU_OPERATOR_VERSIONED_KERNEL( KernelDefBuilder().MayInplace(0, 0).TypeConstraint("T", DataTypeImpl::GetTensorType()), ThresholdedRelu); -ONNX_OPERATOR_KERNEL_EX( - Gelu, - kMSDomain, - 1, - kCpuExecutionProvider, - KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType()), - Gelu); - ONNX_OPERATOR_KERNEL_EX( QuickGelu, kMSDomain, diff --git a/onnxruntime/contrib_ops/cpu/activations.h b/onnxruntime/contrib_ops/cpu/activations.h index aed4c2229215..7e64235d3fc3 100644 --- a/onnxruntime/contrib_ops/cpu/activations.h +++ b/onnxruntime/contrib_ops/cpu/activations.h @@ -54,47 +54,6 @@ namespace contrib { DEFINE_ELE_KERNEL(ScaledTanh); DEFINE_ELE_KERNEL(ParametricSoftplus); -template -class Gelu : public OpKernel { - public: - Gelu(const OpKernelInfo& info) : OpKernel(info) { - } - - Status Compute(OpKernelContext* context) const override { - const Tensor* input = context->Input(0); - const T* input_data = input->Data(); - - Tensor* output = context->Output(0, input->Shape()); - T* output_data = output->MutableData(); - - concurrency::ThreadPool* tp = context->GetOperatorThreadPool(); - int64_t elem_count = input->Shape().Size(); - constexpr int64_t length_per_task = 4096; // this number comes from FastGelu. - int64_t task_count = (elem_count + length_per_task - 1) / length_per_task; - concurrency::ThreadPool::TryBatchParallelFor( - tp, static_cast(task_count), - [&](ptrdiff_t task_idx) { - const auto start = task_idx * length_per_task; - const T* p_input = input_data + start; - T* p_output = output_data + start; - int64_t count = std::min(length_per_task, elem_count - start); - - for (int64_t i = 0; i < count; i++) { - T value = p_input[i]; - p_output[i] = value * static_cast(M_SQRT1_2); - } - - MlasComputeErf(p_output, p_output, narrow(count)); - - for (int64_t i = 0; i < count; i++) { - p_output[i] = 0.5f * p_input[i] * (p_output[i] + 1.0f); - } - }, - 0); - return Status::OK(); - } -}; - // Implement a new one instead of inheriting from ElementWiseRangedTransform so that we can call // MlasComputeLogistic instead of using Eigen for better perf. 
template diff --git a/onnxruntime/contrib_ops/cpu/aten_ops/aten_op_executor.h b/onnxruntime/contrib_ops/cpu/aten_ops/aten_op_executor.h index d72868cd8fa9..56c8e2911e28 100644 --- a/onnxruntime/contrib_ops/cpu/aten_ops/aten_op_executor.h +++ b/onnxruntime/contrib_ops/cpu/aten_ops/aten_op_executor.h @@ -10,7 +10,7 @@ namespace onnxruntime { namespace contrib { namespace aten_ops { -typedef bool (*IsCpuArgumentFunc)(const char* op_name, const char* overload_name, size_t index, bool is_input); +typedef bool (*IsTensorArgumentFunc)(const char* op_name, const char* overload_name, size_t index, bool is_input); typedef void (*ExecuteATenOperatorFunc)(const char* op_name, const char* overload_name, size_t input_size, DLManagedTensor** dlpack_inputs, size_t output_size, DLManagedTensor** dlpack_outputs); @@ -22,17 +22,17 @@ class ATenOperatorExecutor { return instance; } - void Initialize(void* p_is_cpu_argument_func_raw, void* p_execute_aten_op_func_raw) { - ORT_ENFORCE(p_is_cpu_argument_func_raw && p_execute_aten_op_func_raw); - p_is_cpu_argument_func_ = reinterpret_cast(p_is_cpu_argument_func_raw); + void Initialize(void* p_is_tensor_argument_func_raw, void* p_execute_aten_op_func_raw) { + ORT_ENFORCE(p_is_tensor_argument_func_raw && p_execute_aten_op_func_raw); + p_is_tensor_argument_func_ = reinterpret_cast(p_is_tensor_argument_func_raw); p_execute_aten_op_func_ = reinterpret_cast(p_execute_aten_op_func_raw); } bool IsInitialized() { return p_execute_aten_op_func_ != nullptr; } - bool IsCpuArgument(const std::string& op_name, const std::string& overload_name, size_t index, bool is_input) { - ORT_ENFORCE(p_is_cpu_argument_func_, "ATenOperatorExecutor is not initialized."); - return p_is_cpu_argument_func_(op_name.c_str(), overload_name.c_str(), index, is_input); + bool IsTensorArgument(const std::string& op_name, const std::string& overload_name, size_t index, bool is_input) { + ORT_ENFORCE(p_is_tensor_argument_func_, "ATenOperatorExecutor is not initialized."); + return p_is_tensor_argument_func_(op_name.c_str(), overload_name.c_str(), index, is_input); } void operator()(const std::string& op_name, const std::string& overload_name, size_t input_size, @@ -43,7 +43,7 @@ class ATenOperatorExecutor { } private: - IsCpuArgumentFunc p_is_cpu_argument_func_ = nullptr; + IsTensorArgumentFunc p_is_tensor_argument_func_ = nullptr; ExecuteATenOperatorFunc p_execute_aten_op_func_ = nullptr; }; diff --git a/onnxruntime/contrib_ops/cpu/bert/attention.cc b/onnxruntime/contrib_ops/cpu/bert/attention.cc index 4711ccf487cc..768676259aa1 100644 --- a/onnxruntime/contrib_ops/cpu/bert/attention.cc +++ b/onnxruntime/contrib_ops/cpu/bert/attention.cc @@ -211,6 +211,12 @@ Status Attention::Compute(OpKernelContext* context) const { relative_position_bias, ¶meters)); + if (parameters.do_rotary) { + ORT_NOT_IMPLEMENTED( + "Rotary embedding is not supported in Attention CPU kernel. 
\ + Please fuse the model with MHA + RotaryEmbedding."); + } + const int batch_size = parameters.batch_size; const int sequence_length = parameters.sequence_length; const int input_hidden_size = parameters.input_hidden_size; diff --git a/onnxruntime/contrib_ops/cpu/bert/attention_base.cc b/onnxruntime/contrib_ops/cpu/bert/attention_base.cc index 5d224bdc2235..515a967aa238 100644 --- a/onnxruntime/contrib_ops/cpu/bert/attention_base.cc +++ b/onnxruntime/contrib_ops/cpu/bert/attention_base.cc @@ -253,6 +253,7 @@ Status AttentionBase::CheckInputs(const TensorShape& input_shape, output_parameters->is_unidirectional = is_unidirectional_; output_parameters->past_present_share_buffer = (past_present_share_buffer_ != 0 && past != nullptr); output_parameters->do_rotary = do_rotary_; + output_parameters->rotary_embedding = rotary_embedding_ == 0 ? (int)(output_parameters->head_size) : rotary_embedding_; output_parameters->mask_filter_value = mask_filter_value_; output_parameters->scale = scale_; output_parameters->mask_type = mask_type; diff --git a/onnxruntime/contrib_ops/cpu/bert/attention_base.h b/onnxruntime/contrib_ops/cpu/bert/attention_base.h index 5ee40c4b9866..a6782daa58f1 100644 --- a/onnxruntime/contrib_ops/cpu/bert/attention_base.h +++ b/onnxruntime/contrib_ops/cpu/bert/attention_base.h @@ -38,6 +38,7 @@ class AttentionBase { is_unidirectional_ = info.GetAttrOrDefault("unidirectional", 0) == 1; do_rotary_ = info.GetAttrOrDefault("do_rotary", 0) == 1; + rotary_embedding_ = static_cast(info.GetAttrOrDefault("rotary_embedding_dim", 0)); mask_filter_value_ = info.GetAttrOrDefault("mask_filter_value", -10000.0f); scale_ = info.GetAttrOrDefault("scale", 0.0f); @@ -72,6 +73,7 @@ class AttentionBase { bool require_same_hidden_size_; // whether the implementation supports different hidden sizes of Q/K/V. bool past_present_share_buffer_; // whether or not the past (if used) and present tensor share the same buffer bool do_rotary_; // whether or not to use rotary embeddings + int rotary_embedding_; // rotary embedding dimension float mask_filter_value_; // the value to be used for filtered out positions float scale_; // the scale to be used for softmax }; diff --git a/onnxruntime/contrib_ops/cpu/bert/attention_common.h b/onnxruntime/contrib_ops/cpu/bert/attention_common.h index a7f83469a768..5a0c3af05c9d 100644 --- a/onnxruntime/contrib_ops/cpu/bert/attention_common.h +++ b/onnxruntime/contrib_ops/cpu/bert/attention_common.h @@ -56,6 +56,7 @@ struct AttentionParameters { int v_head_size; // hidden size per head of V int num_heads; int num_splits; + int rotary_embedding; bool is_unidirectional; bool past_present_share_buffer; bool do_rotary; @@ -63,6 +64,7 @@ struct AttentionParameters { bool pass_past_in_kv; float mask_filter_value; float scale; + bool use_tf32; AttentionMaskType mask_type; AttentionQkvFormat qkv_format; }; @@ -81,6 +83,7 @@ struct PackedAttentionParameters { int token_count; bool has_relative_position_bias; bool broadcast_res_pos_bias; + bool use_tf32; }; // Parameters deduced from node attributes and inputs/outputs. 
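One detail in the `AttentionBase::CheckInputs` change above deserves a restatement: a `rotary_embedding_dim` attribute of 0 means "rotate the whole head", while a nonzero value rotates only the leading components of each head. A sketch of the defaulting rule:

```ts
// Mirrors `rotary_embedding_ == 0 ? head_size : rotary_embedding_` above:
// 0 selects the full head; otherwise only a prefix of each head is rotated
// (the pass-through of the tail shows up in the RotaryEmbedding kernel below).
function effectiveRotaryDim(rotaryEmbeddingDimAttr: number, headSize: number): number {
  return rotaryEmbeddingDimAttr === 0 ? headSize : rotaryEmbeddingDimAttr;
}
```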
@@ -95,13 +98,19 @@ struct GroupQueryAttentionParameters { int kv_hidden_size; int kv_num_heads; int num_splits; // number of splits for splitkv + int rotary_dim; // rotary embedding dimension bool is_unidirectional; // causal int local_window_size; bool kv_share_buffer; + bool is_packed_qkv; bool is_prompt; // determines if seqlens_k is past or kv sequence length tensor + bool do_rotary; + bool rotary_interleaved; float scale; AttentionQkvFormat qkv_format; AttentionQkvFormat past_kv_format; + int zeros_count; + int* zero_ptr; }; namespace attention { @@ -132,6 +141,10 @@ constexpr const char* kMinSeqLenForFlashAttentionPackedQKV = "ORT_MIN_SEQ_LEN_FL // Default value for the above setting. constexpr int kDefaultMinSeqLenForFlashAttentionPackedQKV = 513; +// Environment variable to enable loading more KV data in flight in +// DecoderMaskedMultiHeadAttention/DecoderMaskedSelfAttention kernels +constexpr const char* kDecoderMaskedAttentionLoadKVDataInFlight = "ORT_DECODER_MASKED_ATTENTION_LOAD_KV_DATA_IN_FLIGHT"; + } // namespace attention } // namespace contrib diff --git a/onnxruntime/contrib_ops/cpu/bert/attention_cpu_base.h b/onnxruntime/contrib_ops/cpu/bert/attention_cpu_base.h index b761b1afd852..34f57c1655cc 100644 --- a/onnxruntime/contrib_ops/cpu/bert/attention_cpu_base.h +++ b/onnxruntime/contrib_ops/cpu/bert/attention_cpu_base.h @@ -140,26 +140,35 @@ class AttentionCPUBase : public AttentionBase { if (mask_data != nullptr) { PrepareMask(mask_index, mask_index_dims, mask_data, causal, batch_size, sequence_length, past_sequence_length, mask_filter_value_); - } else { // no any mask - const int memset_loop_len = batch_size * num_heads_; - const double memset_cost = static_cast(sequence_length) * total_sequence_length; - - ThreadPool::TryParallelFor(tp, memset_loop_len, memset_cost, [&](std::ptrdiff_t begin, std::ptrdiff_t end) { - for (std::ptrdiff_t i = begin; i != end; ++i) { - const int output_offset = static_cast(i) * sequence_length * total_sequence_length; - T* output = attention_probs + output_offset; - memset(output, 0, static_cast(sequence_length) * total_sequence_length * sizeof(T)); - } - }); } const int loop_len = batch_size * num_heads_; const float alpha = scale_ == 0.0f ? 
1.0f / sqrt(static_cast(head_size)) : scale_; - // The cost of Gemm - const double cost = static_cast(head_size) * sequence_length * total_sequence_length; + TensorOpCost unit_cost; + const size_t probs_matrix_bytes = SafeInt(sequence_length) * total_sequence_length * sizeof(T); + unit_cost.compute_cycles = static_cast(2 * sequence_length * head_size * total_sequence_length); + unit_cost.bytes_loaded = static_cast((sequence_length + total_sequence_length) * head_size * sizeof(T)); + unit_cost.bytes_stored = static_cast(probs_matrix_bytes); + + if (mask_data != nullptr) { + unit_cost.bytes_loaded += static_cast(probs_matrix_bytes); + unit_cost.bytes_stored += static_cast(probs_matrix_bytes); + } + + if (present || present_key) { + double bytes_to_copy_key = static_cast(sizeof(T) * present_chunk_length); + unit_cost.bytes_loaded += bytes_to_copy_key; + unit_cost.bytes_stored += bytes_to_copy_key; + } - ThreadPool::TryParallelFor(tp, loop_len, cost, [&](std::ptrdiff_t begin, std::ptrdiff_t end) { + if (relative_position_bias_data != nullptr) { + unit_cost.compute_cycles += static_cast(sequence_length * total_sequence_length); + unit_cost.bytes_loaded += probs_matrix_bytes * 2; + unit_cost.bytes_stored += probs_matrix_bytes; + } + + ThreadPool::TryParallelFor(tp, loop_len, unit_cost, [&](std::ptrdiff_t begin, std::ptrdiff_t end) { for (std::ptrdiff_t i = begin; i != end; ++i) { const int batch_index = static_cast(i) / num_heads_; @@ -171,7 +180,7 @@ class AttentionCPUBase : public AttentionBase { if (mask_data != nullptr) { memcpy(output, mask_data + mask_offset, - static_cast(sequence_length) * total_sequence_length * sizeof(T)); + probs_matrix_bytes); } const T* k = K + kv_input_chunk_length * i; @@ -188,7 +197,7 @@ class AttentionCPUBase : public AttentionBase { // B: K' (B x N x) T x H (B x N x) H x T H x T // C: attention_probs (B x N x) S x T (B x N x) S x T S x T math::Gemm(CblasNoTrans, CblasTrans, sequence_length, total_sequence_length, head_size, alpha, - Q + q_input_chunk_length * i, k, 1.0, + Q + q_input_chunk_length * i, k, mask_data != nullptr ? 
1.0f : 0.0f, output, nullptr); if (relative_position_bias_data != nullptr) { @@ -238,10 +247,24 @@ class AttentionCPUBase : public AttentionBase { present += SafeInt(batch_size) * num_heads_ * total_sequence_length * v_head_size; } - const double cost = - static_cast(sequence_length) * static_cast(v_head_size) * static_cast(sequence_length); + // The cost of Gemm + TensorOpCost unit_cost; + unit_cost.compute_cycles = static_cast(2 * sequence_length * v_head_size * total_sequence_length); + unit_cost.bytes_loaded = static_cast((sequence_length + v_head_size) * total_sequence_length * sizeof(T)); + unit_cost.bytes_stored = static_cast(sequence_length * v_head_size * sizeof(T)); + + if (present || present_value) { + double bytes_to_copy_value = static_cast(present_chunk_length * sizeof(T)); + unit_cost.bytes_loaded += bytes_to_copy_value; + unit_cost.bytes_stored += bytes_to_copy_value; + } + + const size_t bytes_to_copy_trans = SafeInt(v_head_size) * sizeof(T); + double bytes_to_copy_trans_all = static_cast(sequence_length * bytes_to_copy_trans); + unit_cost.bytes_loaded += bytes_to_copy_trans_all; + unit_cost.bytes_stored += bytes_to_copy_trans_all; - ThreadPool::TryParallelFor(tp, SafeInt(batch_size) * num_heads_, cost, [&](std::ptrdiff_t begin, std::ptrdiff_t end) { + ThreadPool::TryParallelFor(tp, SafeInt(batch_size) * num_heads_, unit_cost, [&](std::ptrdiff_t begin, std::ptrdiff_t end) { for (std::ptrdiff_t i = begin; i != end; ++i) { const T* v = V + kv_input_chunk_length * i; if (nullptr != present) { @@ -263,9 +286,8 @@ class AttentionCPUBase : public AttentionBase { T* src = current_tmp_data; ptrdiff_t dest_offset = (SafeInt(batch_index) * sequence_length * num_heads_ + head_index) * v_head_size; T* dest = output + dest_offset; - const auto bytes_to_copy = SafeInt(v_head_size) * sizeof(T); for (int j = 0; j < sequence_length; j++) { - memcpy(dest, src, bytes_to_copy); + memcpy(dest, src, bytes_to_copy_trans); src += v_head_size; dest += v_hidden_size; } diff --git a/onnxruntime/contrib_ops/cpu/bert/multihead_attention.cc b/onnxruntime/contrib_ops/cpu/bert/multihead_attention.cc index 694c40bf3eda..c4e4b4ec707f 100644 --- a/onnxruntime/contrib_ops/cpu/bert/multihead_attention.cc +++ b/onnxruntime/contrib_ops/cpu/bert/multihead_attention.cc @@ -40,6 +40,7 @@ MultiHeadAttention::MultiHeadAttention(const OpKernelInfo& info) : OpKernel(i num_heads_ = static_cast(num_heads); mask_filter_value_ = info.GetAttrOrDefault("mask_filter_value", -10000.0f); + is_unidirectional_ = info.GetAttrOrDefault("unidirectional", 0) == 1; } // Reshape Q/K/V from BxSxD to BxSxNxH @@ -57,11 +58,12 @@ Status Reshape_BSD_to_BSNH(Tensor* qkv, // Transpose Q/K/V from BxSxNxH to BxNxSxH Status Transpose_BSNH_to_BNSH(const Tensor* qkv, - OrtValue& qkv_transposed) { + OrtValue& qkv_transposed, + concurrency::ThreadPool* tp = nullptr) { std::vector permutations({0, 2, 1, 3}); gsl::span permutations_span{permutations}; size_t from = 2, to = 1; - SingleAxisTranspose(permutations_span, *qkv, *qkv_transposed.GetMutable(), from, to); + SingleAxisTranspose(permutations_span, *qkv, *qkv_transposed.GetMutable(), from, to, nullptr, tp); return Status::OK(); } @@ -142,7 +144,8 @@ Status AddBiasTranspose(const Tensor* qkv, // Input: Q/K/V dat ORT_RETURN_IF_ERROR(Reshape_BSD_to_BSNH(qkv_with_bias.GetMutable(), batch_size, sequence_length, num_heads, head_size)); // Transpose Q from BxSxNxH to BxNxSxH - ORT_RETURN_IF_ERROR(Transpose_BSNH_to_BNSH(qkv_with_bias.GetMutable(), qkv_with_bias_transposed)); + auto tp = 
context->GetOperatorThreadPool(); + ORT_RETURN_IF_ERROR(Transpose_BSNH_to_BNSH(qkv_with_bias.GetMutable(), qkv_with_bias_transposed, tp)); return Status::OK(); } @@ -283,8 +286,9 @@ Status MultiHeadAttention::Compute(OpKernelContext* context) const { nullptr, ¶meters, num_heads_, - scale, mask_filter_value_, + scale, + is_unidirectional_, past_present_share_buffer, false)); diff --git a/onnxruntime/contrib_ops/cpu/bert/multihead_attention.h b/onnxruntime/contrib_ops/cpu/bert/multihead_attention.h index 4c86b777e984..fb7da78a5c0a 100644 --- a/onnxruntime/contrib_ops/cpu/bert/multihead_attention.h +++ b/onnxruntime/contrib_ops/cpu/bert/multihead_attention.h @@ -18,6 +18,7 @@ class MultiHeadAttention final : public OpKernel, public AttentionCPUBase { protected: int num_heads_; // number of attention heads float mask_filter_value_; + bool is_unidirectional_; }; } // namespace contrib diff --git a/onnxruntime/contrib_ops/cpu/bert/multihead_attention_helper.h b/onnxruntime/contrib_ops/cpu/bert/multihead_attention_helper.h index 00e82c9844b3..c91f5b601b4e 100644 --- a/onnxruntime/contrib_ops/cpu/bert/multihead_attention_helper.h +++ b/onnxruntime/contrib_ops/cpu/bert/multihead_attention_helper.h @@ -25,6 +25,7 @@ Status CheckInputs(const T* query, int num_heads, float mask_filter_value, float scale, + bool is_unidirectional, bool past_present_share_buffer, bool dmmha_packing) { // key_padding_mask (K/V) : (B) or (2*B + 1) or (B, L) or None @@ -315,7 +316,7 @@ Status CheckInputs(const T* query, output_parameters->head_size = hidden_size / num_heads; output_parameters->v_head_size = v_hidden_size / num_heads; output_parameters->num_heads = num_heads; - output_parameters->is_unidirectional = false; + output_parameters->is_unidirectional = is_unidirectional; output_parameters->past_present_share_buffer = past_present_share_buffer; output_parameters->mask_filter_value = mask_filter_value; output_parameters->mask_type = mask_type; @@ -342,6 +343,7 @@ Status CheckInputs(const T* query, int num_heads, float mask_filter_value, float scale, + bool is_unidirectional, bool past_present_share_buffer, bool dmmha_packing, int max_threads_per_block) { @@ -350,8 +352,8 @@ Status CheckInputs(const T* query, } return CheckInputs(query, key, value, bias, key_padding_mask, relative_position_bias, past_key, past_value, - past_seq_len, parameters, num_heads, mask_filter_value, scale, past_present_share_buffer, - dmmha_packing); + past_seq_len, parameters, num_heads, mask_filter_value, scale, is_unidirectional, + past_present_share_buffer, dmmha_packing); } } // namespace multihead_attention_helper diff --git a/onnxruntime/contrib_ops/cpu/bert/rotary_embedding.cc b/onnxruntime/contrib_ops/cpu/bert/rotary_embedding.cc index 47f462d75fcc..aa8b5b5f608f 100644 --- a/onnxruntime/contrib_ops/cpu/bert/rotary_embedding.cc +++ b/onnxruntime/contrib_ops/cpu/bert/rotary_embedding.cc @@ -27,7 +27,13 @@ ONNX_OPERATOR_TYPED_KERNEL_EX( template RotaryEmbedding::RotaryEmbedding(const OpKernelInfo& info) : OpKernel(info) { scale = info.GetAttrOrDefault("scale", 1.0); + rotary_embedding_dim = static_cast(info.GetAttrOrDefault("rotary_embedding_dim", 0)); + num_heads = static_cast(info.GetAttrOrDefault("num_heads", 0)); interleaved = (info.GetAttrOrDefault("interleaved", 0) == 1); + + if (rotary_embedding_dim > 0) { + ORT_ENFORCE(num_heads > 0, "num_heads must be provided if rotary_embedding_dim is specified"); + } } template @@ -42,6 +48,8 @@ Status RotaryEmbedding::Compute(OpKernelContext* context) const { position_ids, cos_cache, 
sin_cache, + num_heads, + rotary_embedding_dim, ¶meters)); Tensor* output = context->Output(0, input->Shape()); @@ -59,61 +67,66 @@ Status RotaryEmbedding::Compute(OpKernelContext* context) const { const int batch_size = parameters.batch_size; const int sequence_length = parameters.sequence_length; - const int num_heads = parameters.num_heads; + const int n_heads = parameters.num_heads; const int head_size = parameters.head_size; const int position_ids_format = parameters.position_ids_format; - const int half_head_size = head_size / 2; + const int rotary_emb_dim = parameters.rotary_embedding_dim; + const int half_rotary_emb_dim = rotary_emb_dim / 2; + // Default input tensor shape is [batch, seq_len, hidden_size] int head_stride = head_size; - int seq_stride = num_heads * head_stride; + int seq_stride = n_heads * head_stride; int batch_stride = sequence_length * seq_stride; if (parameters.transposed) { - // Transposed input tensor shape is [batch, num_heads, seq_len, head_size] + // Transposed input tensor shape is [batch, n_heads, seq_len, head_size] seq_stride = head_size; head_stride = sequence_length * seq_stride; - batch_stride = num_heads * head_stride; + batch_stride = n_heads * head_stride; } AllocatorPtr allocator; ORT_RETURN_IF_ERROR(context->GetTempSpaceAllocator(&allocator)); auto* tp = context->GetOperatorThreadPool(); - const int loop_len = batch_size * sequence_length * num_heads; - const double cost = static_cast(head_size); + const int loop_len = batch_size * sequence_length * n_heads; + const double cost = static_cast(rotary_emb_dim); ThreadPool::TryParallelFor(tp, loop_len, cost, [&](std::ptrdiff_t begin, std::ptrdiff_t end) { for (std::ptrdiff_t ptr = begin; ptr != end; ++ptr) { - const int b = static_cast((ptr / num_heads) / sequence_length); - const int s = static_cast((ptr / num_heads) % sequence_length); - const int n = static_cast(ptr % num_heads); + const int b = static_cast((ptr / n_heads) / sequence_length); + const int s = static_cast((ptr / n_heads) % sequence_length); + const int n = static_cast(ptr % n_heads); const int block_offset = b * batch_stride + s * seq_stride + n * head_stride; const T* input_data = input_src + block_offset; T* output_data = output_dest + block_offset; - // Cache is (M, H/2) + // Cache is (M, H/2) or (M, rotary_embedding_dim/2) const int position_id = (position_ids_format == 0) ? static_cast(pos_ids_data[0]) + s : static_cast(pos_ids_data[b * sequence_length + s]); - const int cache_offset = position_id * half_head_size; + const int cache_offset = position_id * half_rotary_emb_dim; const T* cos_data = cos_cache_data + cache_offset; const T* sin_data = sin_cache_data + cache_offset; int cache_idx = 0; T sign = 0; int j = 0; - for (int i = 0; i < head_size; i++) { + for (int i = 0; i < rotary_emb_dim; i++) { if (interleaved) { - cache_idx = (i / 2) % half_head_size; + cache_idx = (i / 2) % half_rotary_emb_dim; sign = (i % 2 == 0) ? static_cast(-1) : static_cast(1); j = (i % 2 == 0) ? i + 1 : i - 1; // i - sign } else { - cache_idx = i % half_head_size; - sign = (i < half_head_size) ? static_cast(-1) : static_cast(1); - j = (i + half_head_size) % head_size; + cache_idx = i % half_rotary_emb_dim; + sign = (i < half_rotary_emb_dim) ? 
static_cast(-1) : static_cast(1); + j = (i + half_rotary_emb_dim) % rotary_emb_dim; } output_data[i] = input_data[i] * cos_data[cache_idx] + sign * input_data[j] * sin_data[cache_idx]; } + for (int i = rotary_emb_dim; i < head_size; i++) { + output_data[i] = input_data[i]; + } } }); diff --git a/onnxruntime/contrib_ops/cpu/bert/rotary_embedding.h b/onnxruntime/contrib_ops/cpu/bert/rotary_embedding.h index be834a66cdc6..4e32424a22b6 100644 --- a/onnxruntime/contrib_ops/cpu/bert/rotary_embedding.h +++ b/onnxruntime/contrib_ops/cpu/bert/rotary_embedding.h @@ -16,6 +16,8 @@ class RotaryEmbedding final : public OpKernel { protected: float scale; + int num_heads; + int rotary_embedding_dim; bool interleaved; }; diff --git a/onnxruntime/contrib_ops/cpu/bert/rotary_embedding_helper.h b/onnxruntime/contrib_ops/cpu/bert/rotary_embedding_helper.h index 7b2e8289f7b0..dcbb36d1c4a3 100644 --- a/onnxruntime/contrib_ops/cpu/bert/rotary_embedding_helper.h +++ b/onnxruntime/contrib_ops/cpu/bert/rotary_embedding_helper.h @@ -11,14 +11,15 @@ namespace rotary_embedding_helper { // Parameters deduced from node attributes and inputs/outputs. struct RotaryParameters { - int batch_size; // Batch size used by input - int sequence_length; // Sequence length used by input - int hidden_size; // Hidden size used by input - int head_size; // Head size used by cos/sin cache * 2 - int num_heads; // num_heads = hidden_size / head_size - int max_sequence_length; // Sequence length used by cos/sin cache - int position_ids_format; // Format of position ids - 0 is (1), 1 is (batch_size, sequence_length) - bool transposed; // Whether the input tensor has been transposed into (batch, num_heads, seq_len, hidden) + int batch_size; // Batch size used by input + int sequence_length; // Sequence length used by input + int hidden_size; // Hidden size used by input + int head_size; // Head size + int rotary_embedding_dim; // Rotary embedding dimension. 
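To make the rewritten kernel loop above concrete: in the non-interleaved layout, the rotary window of each head is split into two halves, elements i and j = (i + dim/2) mod dim are rotated against the cached cos/sin values, and everything past `rotary_embedding_dim` is copied through untouched. A per-head sketch of the same arithmetic:

```ts
// Sketch of RotaryEmbedding::Compute for one head (non-interleaved case).
// `cos` and `sin` are the cache rows for this position, each of length rotaryDim / 2.
function rotateHead(input: Float32Array, cos: Float32Array, sin: Float32Array,
                    headSize: number, rotaryDim: number): Float32Array {
  const half = rotaryDim / 2;
  const out = new Float32Array(headSize);
  for (let i = 0; i < rotaryDim; i++) {
    const cacheIdx = i % half;
    const sign = i < half ? -1 : 1;  // first half pairs with +half offset, second half with -half
    const j = (i + half) % rotaryDim;
    out[i] = input[i] * cos[cacheIdx] + sign * input[j] * sin[cacheIdx];
  }
  for (let i = rotaryDim; i < headSize; i++) {
    out[i] = input[i];  // tail beyond rotary_embedding_dim passes through unchanged
  }
  return out;
}
```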
+ int num_heads; // num_heads = hidden_size / head_size + int max_sequence_length; // Sequence length used by cos/sin cache + int position_ids_format; // Format of position ids - 0 is (1), 1 is (batch_size, sequence_length) + bool transposed; // Whether the input tensor has been transposed into (batch, num_heads, seq_len, hidden) }; template @@ -26,11 +27,13 @@ Status CheckInputs(const T* input, const T* position_ids, const T* cos_cache, const T* sin_cache, + int num_heads, + int rotary_embedding_dim, void* parameters) { // input : (batch_size, sequence_length, hidden_size) // position ids : (1) or (batch_size, sequence_length) - // cos cache : (max_sequence_length, head_size / 2) - // sin cache : (max_sequence_length, head_size / 2) + // cos cache : (max_sequence_length, rotary_embedding_dim / 2) + // sin cache : (max_sequence_length, rotary_embedding_dim / 2) // Check input const auto& input_dims = input->Shape().GetDims(); @@ -60,6 +63,12 @@ Status CheckInputs(const T* input, "the same shape"); } + // Check num_heads and rotary_embedding_dim + if (rotary_embedding_dim > 0 && num_heads == 0) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "num_heads must be provided if rotary_embedding_dim is ", + "specified"); + } + // Get attributes from inputs int batch_size = static_cast(input_dims[0]); int sequence_length = static_cast(input_dims[1]); @@ -73,8 +82,13 @@ Status CheckInputs(const T* input, transposed = true; } int max_sequence_length = static_cast(cos_cache_dims[0]); - int head_size = static_cast(cos_cache_dims[1]) * 2; - int num_heads = hidden_size / head_size; + int head_size = rotary_embedding_dim == 0 ? static_cast(cos_cache_dims[1]) * 2 + : static_cast(hidden_size / num_heads); + if (rotary_embedding_dim > 0 && rotary_embedding_dim > head_size) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "rotary_embedding_dim must be less than or equal to ", + "head_size"); + } + int position_ids_format = -1; // Check position_ids input shapes @@ -91,23 +105,15 @@ Status CheckInputs(const T* input, } else { position_ids_format = 0; } + // Check cos_cache input shapes if (max_sequence_length != static_cast(cos_cache_dims[0])) { return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Input 'cos_cache' dimension 0 should be same as ", "max_sequence_length, got ", cos_cache_dims[0]); } - if ((head_size / 2) != static_cast(cos_cache_dims[1])) { + if ((head_size / 2) != static_cast(cos_cache_dims[1]) && (rotary_embedding_dim > 0 && (rotary_embedding_dim / 2) != static_cast(cos_cache_dims[1]))) { return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Input 'cos_cache' dimension 1 should be same as ", - "head_size / 2, got ", cos_cache_dims[1]); - } - // Check sin_cache input shapes - if (max_sequence_length != static_cast(sin_cache_dims[0])) { - return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Input 'sin_cache' dimension 0 should be same as ", - "max_sequence_length, got ", sin_cache_dims[0]); - } - if ((head_size / 2) != static_cast(sin_cache_dims[1])) { - return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Input 'sin_cache' dimension 1 should be same as ", - "head_size / 2, got ", sin_cache_dims[1]); + "head_size / 2 or rotary_embedding_dim / 2, got ", cos_cache_dims[1]); } // Set rotary parameters @@ -117,10 +123,11 @@ Status CheckInputs(const T* input, output_parameters->sequence_length = sequence_length; output_parameters->hidden_size = hidden_size; output_parameters->head_size = head_size; - output_parameters->num_heads = num_heads; + output_parameters->num_heads = 
num_heads > 0 ? num_heads : static_cast(hidden_size / head_size); output_parameters->max_sequence_length = max_sequence_length; output_parameters->position_ids_format = position_ids_format; output_parameters->transposed = transposed; + output_parameters->rotary_embedding_dim = rotary_embedding_dim > 0 ? rotary_embedding_dim : head_size; } return Status::OK(); diff --git a/onnxruntime/contrib_ops/cpu/quantization/matmul_nbits.cc b/onnxruntime/contrib_ops/cpu/quantization/matmul_nbits.cc index a9703dc68dd2..602dd98d8c0d 100644 --- a/onnxruntime/contrib_ops/cpu/quantization/matmul_nbits.cc +++ b/onnxruntime/contrib_ops/cpu/quantization/matmul_nbits.cc @@ -1,6 +1,12 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. +#include "contrib_ops/cpu/quantization/matmul_nbits_impl.h" + +#include +#include + +#include "core/common/common.h" #include "core/common/narrow.h" #include "core/common/safeint.h" #include "core/framework/op_kernel.h" @@ -10,9 +16,57 @@ #include "core/providers/cpu/math/matmul_helper.h" #include "core/providers/common.h" +#ifdef ORT_NEURAL_SPEED +#include "contrib_ops/cpu/quantization/neural_speed_gemm.h" +#endif + namespace onnxruntime { namespace contrib { +namespace { +int64_t GetAccuracyLevel(size_t nbits, size_t block_size, int64_t accuracy_level_attr) { + const auto accuracy_level = std::clamp(accuracy_level_attr, + static_cast(CompMostAccurate), + static_cast(CompLeastAccurate)); + +#if defined(ORT_NEURAL_SPEED) + + ORT_UNUSED_PARAMETER(nbits); + ORT_UNUSED_PARAMETER(block_size); + + // Neural Speed APIs already expect a minimum accuracy level so just use the given value. + return accuracy_level; + +#else // defined(ORT_NEURAL_SPEED) + + // Find a supported accuracy level that is not less accurate than the one given. + // CompMostAccurate is always supported with the fallback implementation. + // Note: A higher numeric accuracy level value means lower accuracy, so the comparison order is reversed. 
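The loop that follows implements exactly this fallback search: clamp the attribute into the valid range, then walk back toward `CompMostAccurate` until an available kernel is found. Restated as a sketch (with `isAvailable` standing in for `MlasIsSQNBitGemmAvailable`):

```ts
// Accuracy levels are ordered numerically: a lower value means more accurate,
// and the most accurate level always has a fallback implementation available.
function effectiveAccuracyLevel(attr: number, mostAccurate: number, leastAccurate: number,
                                isAvailable: (level: number) => boolean): number {
  let level = Math.min(Math.max(attr, mostAccurate), leastAccurate);  // clamp into range
  while (level > mostAccurate && !isAvailable(level)) {
    level--;  // fall back toward a more accurate (always-supported) level
  }
  return level;
}
```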
+  int64_t effective_accuracy_level = accuracy_level;
+  for (; effective_accuracy_level > CompMostAccurate; --effective_accuracy_level) {
+    const auto compute_type = static_cast<MLAS_SQNBIT_GEMM_COMPUTE_TYPE>(effective_accuracy_level);
+    if (MlasIsSQNBitGemmAvailable(nbits, block_size, compute_type)) {
+      break;
+    }
+  }
+
+  return effective_accuracy_level;
+
+#endif  // defined(ORT_NEURAL_SPEED)
+}
+}  // namespace
+
+bool GetType(const NodeArg& node_arg, int32_t& type) {
+  type = ONNX_NAMESPACE::TensorProto_DataType_UNDEFINED;
+  const auto* type_proto = node_arg.TypeAsProto();
+  if (!type_proto || !type_proto->has_tensor_type() || !type_proto->tensor_type().has_elem_type()) {
+    return false;
+  }
+
+  type = type_proto->tensor_type().elem_type();
+  return true;
+}
+
 class MatMulNBits final : public OpKernel {
  public:
  MatMulNBits(const OpKernelInfo& info)
@@ -21,18 +75,31 @@ class MatMulNBits final : public OpKernel {
        N_{narrow<size_t>(info.GetAttr<int64_t>("N"))},
        block_size_{narrow<size_t>(info.GetAttr<int64_t>("block_size"))},
        nbits_{narrow<size_t>(info.GetAttr<int64_t>("bits"))},
-       accuracy_level_{info.GetAttr<int64_t>("accuracy_level")} {
+       accuracy_level_{GetAccuracyLevel(nbits_, block_size_, info.GetAttr<int64_t>("accuracy_level"))} {
+    const auto& node = info.node();
+    auto input_defs = node.InputDefs();
+    // g_idx
+    if (input_defs.size() > 4) {
+      act_order_ = true;
+    }
+    int32_t type;
+    if (input_defs.size() > 3 && GetType(*input_defs[3], type)) {
+      zero_point_is_not_quant_ = type != ONNX_NAMESPACE::TensorProto_DataType_UINT8;
+    }
+
     ORT_ENFORCE(nbits_ == 4, "Only 4b quantization is supported for MatMulNBits op, additional bits support is planned.");
-    is_asym_ = info.GetInputCount() >= 4;
+#ifdef ORT_NEURAL_SPEED
     const Tensor* tensor_B = nullptr;
     const Tensor* tensor_scale = nullptr;
     const Tensor* tensor_zero_point = nullptr;
     bool B_constant = info.TryGetConstantInput(1, &tensor_B);
     bool scale_constant = info.TryGetConstantInput(2, &tensor_scale);
     bool zero_point_constant = info.TryGetConstantInput(3, &tensor_zero_point);
+    is_asym_ = info.GetInputCount() >= 4;
     all_constant_ = B_constant && scale_constant;
     all_constant_ = is_asym_ ?
all_constant_ && zero_point_constant : all_constant_; +#endif } Status Compute(OpKernelContext* context) const override; @@ -49,31 +116,47 @@ class MatMulNBits final : public OpKernel { const size_t N_; const size_t block_size_; const size_t nbits_; + bool act_order_{false}; + bool zero_point_is_not_quant_{false}; const int64_t accuracy_level_; const bool column_wise_quant_{true}; IAllocatorUniquePtr packed_b_; size_t packed_b_size_{0}; + +#if defined(ORT_NEURAL_SPEED) + bool is_asym_{false}; bool all_constant_{false}; + +#endif // defined(ORT_NEURAL_SPEED) }; Status MatMulNBits::PrePack(const Tensor& tensor, int input_idx, /*out*/ AllocatorPtr alloc, /*out*/ bool& is_packed, /*out*/ PrePackedWeights* prepacked_weights) { is_packed = false; + if (act_order_ || zero_point_is_not_quant_) { + return Status::OK(); + } +#if defined(ORT_NEURAL_SPEED) + if (!all_constant_) { return Status::OK(); } - auto compt_type = static_cast(accuracy_level_); MLAS_THREADPOOL* pool = NULL; + if (nbits_ != 4) { + return Status::OK(); + } + auto comp_type = static_cast(accuracy_level_); + auto nbits = static_cast(nbits_); if (input_idx == 1) { - packed_b_size_ = MlasNBitsGemmPackBSize(N_, K_, block_size_, static_cast(nbits_), is_asym_, compt_type); + packed_b_size_ = NSNBitsGemmPackBSize(N_, K_, block_size_, nbits, is_asym_, comp_type); if (packed_b_size_ == 0) return Status::OK(); auto qptr = tensor.Data(); packed_b_ = IAllocator::MakeUniquePtr(alloc, packed_b_size_, true); std::memset(packed_b_.get(), 0, packed_b_size_); - MlasNBitsGemmPackB(packed_b_.get(), qptr, nullptr, nullptr, N_, K_, K_, block_size_, static_cast(nbits_), - is_asym_, false, compt_type, pool); + NSNBitsGemmPackB(packed_b_.get(), qptr, nullptr, nullptr, N_, K_, K_, block_size_, nbits, is_asym_, false, + comp_type, pool); if (prepacked_weights) { prepacked_weights->buffers_.push_back(std::move(packed_b_)); prepacked_weights->buffer_sizes_.push_back(packed_b_size_); @@ -82,8 +165,8 @@ Status MatMulNBits::PrePack(const Tensor& tensor, int input_idx, /*out*/ Allocat } if (input_idx == 2 && packed_b_ != nullptr) { auto sptr = tensor.Data(); - MlasNBitsGemmPackB(packed_b_.get(), nullptr, sptr, nullptr, N_, K_, K_, block_size_, static_cast(nbits_), - is_asym_, !is_asym_, compt_type, pool); + NSNBitsGemmPackB(packed_b_.get(), nullptr, sptr, nullptr, N_, K_, K_, block_size_, nbits, is_asym_, !is_asym_, + comp_type, pool); if (prepacked_weights) { prepacked_weights->buffers_.push_back(std::move(packed_b_)); prepacked_weights->buffer_sizes_.push_back(packed_b_size_); @@ -92,8 +175,8 @@ Status MatMulNBits::PrePack(const Tensor& tensor, int input_idx, /*out*/ Allocat } if (input_idx == 3 && packed_b_ != nullptr) { auto zptr = tensor.Data(); - MlasNBitsGemmPackB(packed_b_.get(), nullptr, nullptr, zptr, N_, K_, K_, block_size_, static_cast(nbits_), - is_asym_, is_asym_, compt_type, pool); + NSNBitsGemmPackB(packed_b_.get(), nullptr, nullptr, zptr, N_, K_, K_, block_size_, nbits, is_asym_, is_asym_, + comp_type, pool); if (prepacked_weights) { prepacked_weights->buffers_.push_back(std::move(packed_b_)); prepacked_weights->buffer_sizes_.push_back(packed_b_size_); @@ -101,12 +184,38 @@ Status MatMulNBits::PrePack(const Tensor& tensor, int input_idx, /*out*/ Allocat is_packed = true; } +#else // defined(ORT_NEURAL_SPEED) + + if (input_idx == 1) { + const auto compute_type = static_cast(accuracy_level_); + if (!MlasIsSQNBitGemmAvailable(nbits_, block_size_, compute_type)) { + return Status::OK(); + } + packed_b_size_ = MlasSQNBitGemmPackQuantBDataSize(N_, K_, 
nbits_, block_size_, compute_type); + if (packed_b_size_ == 0) { + return Status::OK(); + } + auto qptr = tensor.DataRaw(); + packed_b_ = IAllocator::MakeUniquePtr(alloc, packed_b_size_, true); + MlasSQNBitGemmPackQuantBData(N_, K_, nbits_, block_size_, compute_type, qptr, packed_b_.get()); + if (prepacked_weights) { + prepacked_weights->buffers_.push_back(std::move(packed_b_)); + prepacked_weights->buffer_sizes_.push_back(packed_b_size_); + } + is_packed = true; + } + +#endif // defined(ORT_NEURAL_SPEED) + return Status::OK(); } Status MatMulNBits::UseSharedPrePackedBuffers(std::vector& prepacked_buffers, int input_idx, /*out*/ bool& used_shared_buffers) { used_shared_buffers = false; + +#if defined(ORT_NEURAL_SPEED) + // Pack three tensors into one buffer if (input_idx == 1) { used_shared_buffers = true; @@ -120,16 +229,27 @@ Status MatMulNBits::UseSharedPrePackedBuffers(std::vector& prep used_shared_buffers = true; packed_b_ = std::move(prepacked_buffers[0]); } + +#else // defined(ORT_NEURAL_SPEED) + + if (input_idx == 1) { + used_shared_buffers = true; + packed_b_ = std::move(prepacked_buffers[0]); + } + +#endif // defined(ORT_NEURAL_SPEED) + return Status::OK(); } Status MatMulNBits::Compute(OpKernelContext* ctx) const { concurrency::ThreadPool* thread_pool = ctx->GetOperatorThreadPool(); - const Tensor* a = ctx->Input(0); const auto* a_data = a->Data(); - if (packed_b_.get()) { +#if defined(ORT_NEURAL_SPEED) + + if (packed_b_) { TensorShape b_shape({static_cast(N_), static_cast(K_)}); MatMulComputeHelper helper; @@ -147,7 +267,7 @@ Status MatMulNBits::Compute(OpKernelContext* ctx) const { const size_t N = static_cast(helper.N()); const size_t K = static_cast(helper.K()); const size_t lda = helper.Lda(false); - std::vector gemm_params(max_len); + std::vector gemm_params(max_len); AllocatorPtr allocator; auto status = ctx->GetTempSpaceAllocator(&allocator); ORT_RETURN_IF_ERROR(status); @@ -158,22 +278,24 @@ Status MatMulNBits::Compute(OpKernelContext* ctx) const { gemm_params[i].C = y_data + helper.OutputOffsets()[i]; gemm_params[i].ldc = N; } - auto ws_size = MlasSQNBitsGemmBatchWorkspaceSize(M, N, K, max_len, gemm_params.data()); + auto ws_size = NSSQNBitsGemmBatchWorkspaceSize(M, N, K, max_len, gemm_params.data()); // workspace for activation process(dynamic quantization and others) auto ws_ptr = IAllocator::MakeUniquePtr(allocator, ws_size); - MlasSQNBitsGemmBatchPackedB(M, N, K, max_len, gemm_params.data(), ws_ptr.get(), - thread_pool); + NSSQNBitsGemmBatchPackedB(M, N, K, max_len, gemm_params.data(), ws_ptr.get(), thread_pool); return Status::OK(); } - const Tensor* b = ctx->Input(1); +#endif // defined(ORT_NEURAL_SPEED) + const Tensor* scales = ctx->Input(2); - const Tensor* zero_points = ctx->Input(3); - const uint8_t* b_data = b->Data(); + const Tensor* zero_points = ctx->InputCount() > 3 ? ctx->Input(3) : nullptr; + const Tensor* reorder_idx = ctx->InputCount() > 4 ? ctx->Input(4) : nullptr; + const auto* scales_data = scales->Data(); - const auto* zero_points_data = zero_points == nullptr ? nullptr : zero_points->Data(); + const auto* zero_points_data = zero_points == nullptr ? nullptr : zero_points->DataRaw(); TensorShape b_shape({static_cast(N_), static_cast(K_)}); + const auto* reorder_idx_data = reorder_idx == nullptr ? 
nullptr : reorder_idx->Data(); MatMulComputeHelper helper; ORT_RETURN_IF_ERROR(helper.Compute(a->Shape(), b_shape, false, true)); @@ -181,8 +303,9 @@ Status MatMulNBits::Compute(OpKernelContext* ctx) const { Tensor* y = ctx->Output(0, helper.OutputShape()); // Bail out early if the output is going to be empty - if (y->Shape().Size() == 0) + if (y->Shape().Size() == 0) { return Status::OK(); + } auto* y_data = y->MutableData(); @@ -192,53 +315,98 @@ Status MatMulNBits::Compute(OpKernelContext* ctx) const { const size_t K = static_cast(helper.K()); const size_t lda = helper.Lda(false); - if (MlasIsSQNBitGemmAvailable(nbits_, block_size_)) { - // number of bytes or elements between adjacent matrices - size_t b_data_matrix_stride_in_bytes, b_scale_matrix_stride, b_zero_point_matrix_stride_in_bytes; - MlasBlockwiseQuantizedBufferSizes(static_cast(nbits_), static_cast(block_size_), /* columnwise */ true, - static_cast(K), static_cast(N), - b_data_matrix_stride_in_bytes, b_scale_matrix_stride, - &b_zero_point_matrix_stride_in_bytes); - - const size_t b_matrix_size = K * N; - - InlinedVector data(batch_count); - for (size_t i = 0; i < batch_count; ++i) { - const size_t b_matrix_offset = helper.RightOffsets()[i] / b_matrix_size; - - data[i].A = a_data + helper.LeftOffsets()[i]; - data[i].lda = lda; - data[i].QuantBData = b_data + b_matrix_offset * b_data_matrix_stride_in_bytes; - data[i].QuantBScale = scales_data + b_matrix_offset * b_scale_matrix_stride; - data[i].QuantBZeroPoint = zero_points_data != nullptr - ? zero_points_data + b_matrix_offset * b_zero_point_matrix_stride_in_bytes - : nullptr; - data[i].C = y_data + helper.OutputOffsets()[i]; - data[i].ldc = N; + const bool has_single_b_matrix = + (!act_order_) && (!zero_point_is_not_quant_) && + std::all_of(helper.RightOffsets().begin(), helper.RightOffsets().end(), [](size_t offset) { return offset == 0; }); + + if (has_single_b_matrix) { + const auto compute_type = static_cast(accuracy_level_); + + if (MlasIsSQNBitGemmAvailable(nbits_, block_size_, compute_type)) { + IAllocatorUniquePtr workspace{}; + if (const size_t workspace_size = MlasSQNBitGemmBatchWorkspaceSize(M, N, K, batch_count, + nbits_, block_size_, compute_type); + workspace_size > 0) { + AllocatorPtr allocator; + ORT_RETURN_IF_ERROR(ctx->GetTempSpaceAllocator(&allocator)); + workspace = IAllocator::MakeUniquePtr(allocator, workspace_size); + } + + const void* b_data = [&]() -> const void* { + if (packed_b_) { + return packed_b_.get(); + } + + const Tensor* b = ctx->Input(1); + return b->DataRaw(); + }(); + + InlinedVector data(batch_count); + for (size_t i = 0; i < batch_count; ++i) { + data[i].A = a_data + helper.LeftOffsets()[i]; + data[i].lda = lda; + data[i].QuantBData = b_data; + data[i].QuantBScale = scales_data; + data[i].QuantBZeroPoint = zero_points_data; + data[i].C = y_data + helper.OutputOffsets()[i]; + data[i].ldc = N; + } + + MlasSQNBitGemmBatch(M, N, K, batch_count, nbits_, block_size_, compute_type, data.data(), workspace.get(), + thread_pool); + + return Status::OK(); } - - MlasSQNBitGemmBatch(M, N, K, batch_count, nbits_, block_size_, data.data(), thread_pool); - - return Status::OK(); } - const size_t ldb = helper.Ldb(true); + const Tensor* b = ctx->Input(1); + const uint8_t* b_data = b->Data(); + const size_t ldb = helper.Ldb(true); AllocatorPtr allocator; ORT_RETURN_IF_ERROR(ctx->GetTempSpaceAllocator(&allocator)); auto tmp_b_data_ptr = IAllocator::MakeUniquePtr(allocator, SafeInt(K_) * N_); - // dequantize b, only 4b quantization is supported for now - 
MlasDequantizeBlockwise( - tmp_b_data_ptr.get(), // dequantized output - b_data, // quantized input - scales_data, // quantization scales - zero_points_data, // quantization zero points - static_cast(block_size_), // quantization block size - column_wise_quant_, // columnwise quantization or row-wise - static_cast(K_), // number of rows in quantized input - static_cast(N_), // number of columns in quantized input - thread_pool); - + if ((reorder_idx_data == nullptr) && (!zero_points || !zero_points->IsDataType())) { + // dequantize b, only 4b quantization is supported for now + MlasDequantizeBlockwise( + tmp_b_data_ptr.get(), // dequantized output + b_data, // quantized input + scales_data, // quantization scales + static_cast(zero_points_data), // quantization zero points + static_cast(block_size_), // quantization block size + column_wise_quant_, // columnwise quantization or row-wise + static_cast(K_), // number of rows in quantized input + static_cast(N_), // number of columns in quantized input + thread_pool); + } else { + ORT_ENFORCE(column_wise_quant_, "Row-wise quantization is not supported for now"); + // !!!!!!!!!!!!!! naive implementation, need to be optimized !!!!!!!!!!!!!! + if ((zero_points && zero_points->IsDataType())) { + DequantizeBlockwise( + tmp_b_data_ptr.get(), // dequantized output + b_data, // quantized input + scales_data, // quantization scales + static_cast(zero_points_data), // quantization zero points + reorder_idx_data, + static_cast(block_size_), // quantization block size + column_wise_quant_, // columnwise quantization or row-wise + static_cast(K_), // number of rows in quantized input + static_cast(N_), // number of columns in quantized input + thread_pool); + } else { + DequantizeBlockwise( + tmp_b_data_ptr.get(), // dequantized output + b_data, // quantized input + scales_data, // quantization scales + static_cast(zero_points_data), // quantization zero points + reorder_idx_data, + static_cast(block_size_), // quantization block size + column_wise_quant_, // columnwise quantization or row-wise + static_cast(K_), // number of rows in quantized input + static_cast(N_), // number of columns in quantized input + thread_pool); + } + } #if 0 // for debug auto tm_b_data_ptr_trans = IAllocator::MakeUniquePtr(allocator, SafeInt(K_) * N_); MlasTranspose(tmp_b_data_ptr.get(), tm_b_data_ptr_trans.get(), N_, K_); @@ -269,7 +437,9 @@ ONNX_OPERATOR_KERNEL_EX( kCpuExecutionProvider, KernelDefBuilder() .TypeConstraint("T1", DataTypeImpl::GetTensorType()) - .TypeConstraint("T2", DataTypeImpl::GetTensorType()), + .TypeConstraint("T2", DataTypeImpl::GetTensorType()) + .TypeConstraint("T3", {DataTypeImpl::GetTensorType(), DataTypeImpl::GetTensorType()}) + .TypeConstraint("T4", DataTypeImpl::GetTensorType()), MatMulNBits); } // namespace contrib diff --git a/onnxruntime/contrib_ops/cpu/quantization/matmul_nbits_impl.cc b/onnxruntime/contrib_ops/cpu/quantization/matmul_nbits_impl.cc new file mode 100644 index 000000000000..7e343d85f404 --- /dev/null +++ b/onnxruntime/contrib_ops/cpu/quantization/matmul_nbits_impl.cc @@ -0,0 +1,109 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
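+
+// Reference blockwise dequantization used by MatMulNBits for the cases the MLAS
+// fast path does not cover: an optional g_idx reorder index and zero points that
+// arrive as floats instead of packed 4-bit values.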
+#include "contrib_ops/cpu/quantization/matmul_nbits_impl.h" + +#include +#include +#include +#include +#include + +#include "core/common/common.h" +#include "core/framework/float16.h" +#include "core/providers/common.h" +#include "core/platform/threadpool.h" + +namespace onnxruntime { +namespace contrib { + +template +void Dequantize4BitsKernelReOrder( + T* output, const uint8_t* quant_data, const T* scale_data, + const zeroT* zero_points, const int32_t* reorder_idx, int block_size, + int groups_per_threadblock, int total_groups, int out_rows, int out_cols, + int blockIdx_x, int threadIdx_x) { + const int group_id = blockIdx_x * groups_per_threadblock + ((threadIdx_x * 8) / block_size); + if (group_id >= total_groups) { + return; + } + const int scales_shape_x = (out_cols + block_size - 1) / block_size; + const int zero_point_shape_x = (scales_shape_x + 1) / 2; + + int n_idx = group_id / scales_shape_x; + int kb_idx = group_id % scales_shape_x; + int element_offset = group_id * block_size + ((threadIdx_x * 8) & (block_size - 1)); + + const int out_x = element_offset % (scales_shape_x * block_size); + const int out_y = element_offset / (scales_shape_x * block_size); + if (out_y >= out_rows || out_x >= out_cols) { + return; + } + T* output_i = output + out_y * out_cols + out_x; + uint32_t quant_value = *(reinterpret_cast(quant_data + element_offset / 2)); + const int remain_x = std::min(8, out_cols - out_x); + const int32_t* reorder_idx_with_off = reorder_idx + kb_idx * block_size + ((threadIdx_x * 8) & (block_size - 1)); + for (int i = 0; i < remain_x; i++) { + int32_t rid = reorder_idx ? reorder_idx_with_off[i] : kb_idx; + T scale = *(scale_data + n_idx * scales_shape_x + rid); + float zp_f = 8; + if (zero_points) { + if constexpr (std::is_same_v) { + zp_f = *(zero_points + n_idx * scales_shape_x + rid); + } else { + uint8_t zp = 8; + zp = zero_points[n_idx * zero_point_shape_x + rid / 2]; + zp = (rid & 0x01) ? 
(zp >> 4) : (zp & 0x0f); + } + } + + if constexpr (std::is_same_v) { + T zp_adjust = -scale * MLFloat16(zp_f); + output_i[i] = static_cast((quant_value >> (4 * i)) & 0xF) * scale + zp_adjust; + } else { + T zp_adjust = -scale * zp_f; + output_i[i] = T((quant_value >> (4 * i)) & 0xF) * scale + zp_adjust; + } + } +} + +template +void DequantizeBlockwise( + inputT* output, // dequantized output + const uint8_t* quant_data, // quantized input + const inputT* scales_data, // quantization scales + const zeroT* zero_points, // quantization zero points + const int32_t* reorder_idx, // reorder_idx for groupwise quantization + int32_t block_size, // quantization block size + bool, // columnwise quantization or row-wise + int32_t K, // number of rows in quantized input + int32_t N, // number of columns in quantized input + onnxruntime::concurrency::ThreadPool* pool) { + auto ceildiv = [](int a, int b) { return (a + b - 1) / b; }; + constexpr int element_per_thread = 8; + int groups_per_threadblock = 256 * element_per_thread / block_size; + int groups_per_K = ceildiv(K, block_size); + int total_groups = N * groups_per_K; // total elemenets in quant_data + int blocks_per_grid = static_cast(ceildiv(total_groups, groups_per_threadblock)); + concurrency::ThreadPool::TrySimpleParallelFor( + pool, static_cast(blocks_per_grid), + [&](std::ptrdiff_t block_id) { + for (int j = 0; j < 256; j++) { + Dequantize4BitsKernelReOrder(output, quant_data, scales_data, zero_points, + reorder_idx, block_size, groups_per_threadblock, + total_groups, N, K, static_cast(block_id), j); + } + }); +} + +template void DequantizeBlockwise( + float* output, const uint8_t* quant_data, const float* scales_data, + const uint8_t* zero_points, const int32_t* reorder_idx, int32_t block_size, + bool columnwise, int32_t K, int32_t N, onnxruntime::concurrency::ThreadPool* thread_pool); + +template void DequantizeBlockwise( + float* output, const uint8_t* quant_data, const float* scales_data, + const float* zero_points, const int32_t* reorder_idx, int32_t block_size, + bool columnwise, int32_t K, int32_t N, onnxruntime::concurrency::ThreadPool* thread_pool); + +} // namespace contrib +} // namespace onnxruntime diff --git a/onnxruntime/contrib_ops/cpu/quantization/matmul_nbits_impl.h b/onnxruntime/contrib_ops/cpu/quantization/matmul_nbits_impl.h new file mode 100644 index 000000000000..5061ac5c800a --- /dev/null +++ b/onnxruntime/contrib_ops/cpu/quantization/matmul_nbits_impl.h @@ -0,0 +1,23 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
+#include "core/providers/common.h" +#include "core/platform/threadpool.h" + +namespace onnxruntime { +namespace contrib { + +template +void DequantizeBlockwise( + inputT* output, // dequantized output + const uint8_t* quant_data, // quantized input + const inputT* scales_data, // quantization scales + const zeroT* zero_points, // quantization zero points + const int32_t* reorder_idx, // quantization zero points + int32_t block_size, // quantization block size + bool, // columnwise quantization or row-wise + int32_t K, // number of rows in quantized input + int32_t N, // number of columns in quantized input + onnxruntime::concurrency::ThreadPool* thread_pool); + +} // namespace contrib +} // namespace onnxruntime diff --git a/onnxruntime/contrib_ops/cpu/quantization/neural_speed_defs.h b/onnxruntime/contrib_ops/cpu/quantization/neural_speed_defs.h new file mode 100644 index 000000000000..864abffd131f --- /dev/null +++ b/onnxruntime/contrib_ops/cpu/quantization/neural_speed_defs.h @@ -0,0 +1,45 @@ +/*++ + +Copyright (c) Microsoft Corporation. All rights reserved. + +Licensed under the MIT License. + +--*/ + +#pragma once + +#include "contrib_ops/cpu/quantization/neural_speed_wrapper.h" + +namespace bestla { + +using tAVX512F = gemm::SCoreRowNAvx512f<48, 8>; +using tAMX_BF16 = gemm::HCoreRowNAmxbf16<64, 16>; +using tAVX512_FP16 = gemm::HCoreRowNAvx512fp16<96, 8>; +using tAVX_VNNI = gemm::ICoreRowNAvxvnni<24, 4>; +using tAVX512_VNNI = gemm::ICoreRowNAvx512vnni<48, 8>; +using tAMX_INT8_US = gemm::ICoreRowNAmxint8<64, 16>; +using tAMX_INT8_SS = gemm::ICoreRowNAmxint8SS<64, 16>; +using tAVX2 = gemm::SCoreRowNAvx2<24, 4>; +using tAVX_VNNI_KBlock = gemm::ICoreRowNAvxvnniKBlock<24, 2>; +using tAVX512_VNNI_KBlock = gemm::ICoreRowNAvx512vnniKBlock<48, 4>; +using tAMX_INT8_US_KBlock = gemm::ICoreRowNAmxint8KBlock<48, 16>; +using tAMX_INT8_SS_KBlock = gemm::ICoreRowNAmxint8SSKBlock<48, 16>; + +template +using tWeiNInt = prologue_b::gemm::WeightKBlockNInteger; +template +using tWeiNFloat = prologue_b::gemm::WeightKBlockNFloat; + +class ORTThreading : public parallel::IThreading { + public: + explicit ORTThreading(void* tp); + void parallel_for(const parallel::thread_func& func) const override; + void set_threads(int nthreads) override { + (void)(nthreads); + assert(0); + } + void sync() const override { assert(0); } + void* mTp; +}; + +} // namespace bestla diff --git a/onnxruntime/contrib_ops/cpu/quantization/neural_speed_gemm.cc b/onnxruntime/contrib_ops/cpu/quantization/neural_speed_gemm.cc new file mode 100644 index 000000000000..73aaa4ae61a6 --- /dev/null +++ b/onnxruntime/contrib_ops/cpu/quantization/neural_speed_gemm.cc @@ -0,0 +1,438 @@ +/*++ + +Copyright (c) Microsoft Corporation. All rights reserved. + +Licensed under the MIT License. + +Module Name: + + neural_speed_gemm.cpp + +Abstract: + + GEMM template combinations of neural_speed. 
+--*/ + +#include "contrib_ops/cpu/quantization/neural_speed_defs.h" +#include "contrib_ops/cpu/quantization/neural_speed_gemm.h" +#include "core/platform/threadpool.h" + +using ThreadPool = onnxruntime::concurrency::ThreadPool; + +namespace bestla { + +ORTThreading::ORTThreading(void* tp) + : IThreading(ThreadPool::DegreeOfParallelism(reinterpret_cast(tp))), mTp(tp) {} + +void ORTThreading::parallel_for(const parallel::thread_func& func) const { + ThreadPool::TrySimpleParallelFor(reinterpret_cast(mTp), mThreadNum, + [&](ptrdiff_t tid) { func(static_cast(tid)); }); +} + +template +static void NSSQ4GemmCompF32(size_t M, size_t N, size_t K, const float* A, size_t lda, + storage::gemm::StorageWeightKBlockNInteger* B, float* C, size_t ldc, int8_t* WorkSpace, + parallel::IThreading* th) { + auto M_ = static_cast(M); + auto N_ = static_cast(N); + auto K_ = static_cast(K); + auto lda_ = static_cast(lda); + auto ldc_ = static_cast(ldc); + utils::GemmProblem gp(1, M_, N_, K_, B->mBlockSize); + if (M <= 16) { + using Parallel = parallel::gemm::SchedulerKBlock; + using Launcher = + wrapper::gemm::LauncherKBlock; + static Launcher kernel; + auto reduceA = kernel.mProA.createStorage(M_, K_, B->mBlockSize); + if (B->IsAsym()) { + reduceA.assign(WorkSpace); + ORTThreading single(nullptr); + kernel.mProA.reduce({A, lda_, &reduceA}, M_, K_, B->mBlockSize, &single); + } + typename Launcher::Param args{gp, + {A, lda_, &reduceA}, + {B}, + {B->template SPtr(), B->SDtype(), B->CStep(), B->template ZPtr(), + reduceA.template RPtr(), reduceA.lda}, + {C, ldc_, nullptr}}; + parallel::GemmRun(kernel, args, th); + } else { + using Parallel = parallel::gemm::SchedulerBase; + using Launcher = + wrapper::gemm::LauncherBase; + static Launcher kernel; + typename Launcher::Param args{gp, {A, lda_}, {B}, {C, ldc_, nullptr}}; + parallel::GemmRun(kernel, args, th); + } +} + +template +static void NSSQ4GemmCompInt8(size_t M, size_t N, size_t K, const float* A, size_t lda, + storage::gemm::StorageWeightKBlockNInteger* B, float* C, size_t ldc, int8_t* WorkSpace, + parallel::IThreading* th) { + using Parallel = parallel::gemm::SchedulerKBlockS; + using Launcher = + wrapper::gemm::LauncherIntKBlock; + auto M_ = static_cast(M); + auto N_ = static_cast(N); + auto K_ = static_cast(K); + auto lda_ = static_cast(lda); + auto ldc_ = static_cast(ldc); + static Launcher kernel; + auto quanA = kernel.mProA.createStorage(M_, K_, B->mBlockSize, B->IsAsym()); + quanA.assign(WorkSpace); + if (M <= 16) { + ORTThreading single(nullptr); + kernel.mProA.quantize({A, lda_, &quanA}, M_, K_, &single); + } else { + kernel.mProA.quantize({A, lda_, &quanA}, M_, K_, th); + } + utils::GemmProblem gp(1, M_, N_, K_, B->mBlockSize); + typename Launcher::Param args{gp, {A, lda_, &quanA}, {B}, {C, ldc_, nullptr}}; + parallel::GemmRun(kernel, args, th); +} + +template +static size_t NSSQ4GemmCompF32WorkspaceSize(size_t M, size_t N, size_t K, const float* A, size_t lda, + storage::gemm::StorageWeightKBlockNInteger* B, float* C, size_t ldc) { + auto M_ = static_cast(M); + auto K_ = static_cast(K); + (void)(A); + (void)(N); + (void)(C); + (void)(lda); + (void)(ldc); + if (M <= 16) { + using ProA = prologue_a::gemm::ActivationKBlockBaseF32; + static ProA proA; + if (B->IsAsym()) { + auto reduceA = proA.createStorage(M_, K_, B->mBlockSize); + return reduceA.mSize; + } + return 0; + } else { + // using ProA = prologue_a::gemm::ActivationBase; + return 0; + } +} + +template +static size_t NSSQ4GemmCompInt8WorkspaceSize(size_t M, size_t N, size_t K, const float* A, 
size_t lda, + storage::gemm::StorageWeightKBlockNInteger* B, float* C, size_t ldc) { + (void)(N); + (void)(lda); + (void)(ldc); + (void)(A); + (void)(C); + using ProA = prologue_a::gemm::ActivationF32KBlockQuantize; + static ProA proA; + auto quanA = + proA.createStorage(static_cast(M), static_cast(K), static_cast(B->mBlockSize), B->IsAsym()); + return quanA.mSize; +} + +} // namespace bestla + +using namespace bestla; + +static bool NSSQ4GemmBatchDriver(size_t M, size_t N, size_t K, size_t BatchN, + const NS_SQNBITS_GEMM_DATA_PACKED_PARAMS* DataParams, int8_t* WorkSpace, + void* ThreadPool) { + GetCPUDevice(); + bestla::ORTThreading orth(ThreadPool); + bool processed = true; + for (size_t i = 0; i < BatchN; i++) { + auto ptr = bestla::storage::gemm::PackedWeightParser::deserialBuffer(DataParams[i].B); + auto uptr = std::unique_ptr(ptr); + if (ptr) { + auto NTile = gemm::CoreAttr::get_mask_val(ptr->mCoreId, gemm::CoreAttr::NTILE_MASK, gemm::CoreAttr::NTILE_SHIFT); + auto PackRow = gemm::CoreAttr::get_packrow(ptr->mCoreId); + auto CType = gemm::CoreAttr::get_comp(ptr->mCoreId); + auto btype = static_cast(gemm::CompTypeHelper::get_B(CType)); + if (ptr->mPrologueID == BTLA_PROLOGUEB_IDS::WeightKBlockNInteger) { + auto kptr = reinterpret_cast(ptr); + auto BlkSize = kptr->mBlockSize; + if (btype == gemm::CompType::tFP32 && PackRow == 1) { + if (NTile == bestla::tAVX512F::NTILE && _cd->AVX512F() && BlkSize % tAVX512F::KTILE == 0) { + bestla::NSSQ4GemmCompF32(M, N, K, DataParams[i].A, DataParams[i].lda, kptr, + DataParams[i].C, DataParams[i].ldc, WorkSpace, &orth); + } else if (NTile == bestla::tAVX2::NTILE && _cd->AVX2() && BlkSize % tAVX2::KTILE == 0) { + bestla::NSSQ4GemmCompF32(M, N, K, DataParams[i].A, DataParams[i].lda, kptr, DataParams[i].C, + DataParams[i].ldc, WorkSpace, &orth); + } + } + if (btype == gemm::CompType::tS8 && PackRow == 4) { + if (NTile == bestla::tAMX_INT8_SS_KBlock::NTILE && _cd->AMX_INT8() && + BlkSize % tAMX_INT8_SS_KBlock::KTILE == 0) { + bestla::NSSQ4GemmCompInt8(M, N, K, DataParams[i].A, DataParams[i].lda, kptr, + DataParams[i].C, DataParams[i].ldc, WorkSpace, + &orth); + } else if (NTile == bestla::tAVX512_VNNI_KBlock::NTILE && _cd->AVX512_VNNI() && + BlkSize % tAVX512_VNNI_KBlock::KTILE == 0) { + bestla::NSSQ4GemmCompInt8(M, N, K, DataParams[i].A, DataParams[i].lda, kptr, + DataParams[i].C, DataParams[i].ldc, WorkSpace, + &orth); + } else if (NTile == bestla::tAVX_VNNI_KBlock::NTILE && _cd->AVX_VNNI() && + BlkSize % tAVX_VNNI_KBlock::KTILE == 0) { + bestla::NSSQ4GemmCompInt8(M, N, K, DataParams[i].A, DataParams[i].lda, kptr, + DataParams[i].C, DataParams[i].ldc, WorkSpace, &orth); + } + } + } + } else { + processed = false; + break; + } + } + return processed; +} + +static size_t NSSQ4GemmBatchWorkspaceSize(size_t M, size_t N, size_t K, size_t BatchN, + const NS_SQNBITS_GEMM_DATA_PACKED_PARAMS* DataParams) { + GetCPUDevice(); + size_t size = 0; + for (size_t i = 0; i < BatchN; i++) { + auto ptr = storage::gemm::PackedWeightParser::deserialBuffer(DataParams[i].B); + auto uptr = std::unique_ptr(ptr); + if (ptr) { + if (ptr->mPrologueID == BTLA_PROLOGUEB_IDS::WeightKBlockNInteger) { + auto kptr = reinterpret_cast(ptr); + auto NTile = + gemm::CoreAttr::get_mask_val(ptr->mCoreId, gemm::CoreAttr::NTILE_MASK, gemm::CoreAttr::NTILE_SHIFT); + auto PackRow = gemm::CoreAttr::get_packrow(ptr->mCoreId); + auto CType = gemm::CoreAttr::get_comp(ptr->mCoreId); + auto btype = static_cast(gemm::CompTypeHelper::get_B(CType)); + auto BlkSize = kptr->mBlockSize; + if (btype == 
gemm::CompType::tFP32 && PackRow == 1) { + if (NTile == tAVX512F::NTILE && _cd->AVX512F() && BlkSize % tAVX512F::KTILE == 0) { + size = std::max(NSSQ4GemmCompF32WorkspaceSize(M, N, K, DataParams[i].A, DataParams[i].lda, kptr, + DataParams[i].C, DataParams[i].ldc), + size); + } else if (NTile == tAVX2::NTILE && _cd->AVX2() && BlkSize % tAVX2::KTILE == 0) { + size = std::max(NSSQ4GemmCompF32WorkspaceSize(M, N, K, DataParams[i].A, DataParams[i].lda, kptr, + DataParams[i].C, DataParams[i].ldc), + size); + } + } + if (btype == gemm::CompType::tS8 && PackRow == 4) { + if (NTile == tAMX_INT8_SS_KBlock::NTILE && _cd->AMX_INT8() && BlkSize % tAMX_INT8_SS_KBlock::KTILE == 0) { + size = std::max(NSSQ4GemmCompInt8WorkspaceSize( + M, N, K, DataParams[i].A, DataParams[i].lda, kptr, DataParams[i].C, DataParams[i].ldc), + size); + } else if (NTile == tAVX512_VNNI_KBlock::NTILE && _cd->AVX512_VNNI() && + BlkSize % tAVX512_VNNI_KBlock::KTILE == 0) { + size = std::max(NSSQ4GemmCompInt8WorkspaceSize( + M, N, K, DataParams[i].A, DataParams[i].lda, kptr, DataParams[i].C, DataParams[i].ldc), + size); + } else if (NTile == tAVX_VNNI_KBlock::NTILE && _cd->AVX_VNNI() && BlkSize % tAVX_VNNI_KBlock::KTILE == 0) { + size = std::max(NSSQ4GemmCompInt8WorkspaceSize( + M, N, K, DataParams[i].A, DataParams[i].lda, kptr, DataParams[i].C, DataParams[i].ldc), + size); + } + } + } + } + } + return size; +} + +template +static size_t NSQ4BuSize(size_t block_size, size_t N, size_t K, bool isAsym) { + static T proB; + auto stor = proB.createStorage(static_cast(N), static_cast(K), static_cast(block_size), + BTLA_DTYPE::S4_CLIP, BTLA_DTYPE::F32, BTLA_DTYPE::BF16, isAsym); + // TODO(Yu) support more scale dtype + return stor.mSize; +} + +static bool NSQ4GemmUnPackB(float* FpData, const void* PackedBuf, size_t N, size_t K, size_t ldb, void* ThreadPool) { + auto ptr = storage::gemm::PackedWeightParser::deserialBuffer(PackedBuf); + auto uptr = std::unique_ptr(ptr); + ORTThreading orth(ThreadPool); + auto N_ = static_cast(N); + auto K_ = static_cast(K); + auto ldb_ = static_cast(ldb); + GetCPUDevice(); + if (ptr) { + auto NTile = gemm::CoreAttr::get_mask_val(ptr->mCoreId, gemm::CoreAttr::NTILE_MASK, gemm::CoreAttr::NTILE_SHIFT); + auto PackRow = gemm::CoreAttr::get_packrow(ptr->mCoreId); + auto CType = gemm::CoreAttr::get_comp(ptr->mCoreId); + auto btype = static_cast(gemm::CompTypeHelper::get_B(CType)); + if (ptr->mPrologueID == BTLA_PROLOGUEB_IDS::WeightKBlockNInteger) { + auto wptr = reinterpret_cast(ptr); + auto BlkSize = wptr->mBlockSize; + if (btype == gemm::CompType::tFP32 && PackRow == 1) { + if (NTile == tAVX512F::NTILE && _cd->AVX512F() && BlkSize % tAVX512F::KTILE == 0) { + static tWeiNInt proB; + proB.unpackWeight(N_, K_, wptr, FpData, ldb_, &orth); + } else if (NTile == tAVX2::NTILE && _cd->AVX2() && BlkSize % tAVX2::KTILE == 0) { + static tWeiNInt proB; + proB.unpackWeight(N_, K_, wptr, FpData, ldb_, &orth); + } + } + if (btype == gemm::CompType::tS8 && PackRow == 4) { + if (NTile == tAMX_INT8_SS_KBlock::NTILE && _cd->AMX_INT8() && BlkSize % tAMX_INT8_SS_KBlock::KTILE == 0) { + static tWeiNInt proB; + proB.unpackWeight(N_, K_, wptr, FpData, ldb_, &orth); + } else if (NTile == tAVX512_VNNI_KBlock::NTILE && _cd->AVX512_VNNI() && + BlkSize % tAVX512_VNNI_KBlock::KTILE == 0) { + static tWeiNInt proB; + proB.unpackWeight(N_, K_, wptr, FpData, ldb_, &orth); + } else if (NTile == tAVX_VNNI_KBlock::NTILE && _cd->AVX_VNNI() && BlkSize % tAVX_VNNI_KBlock::KTILE == 0) { + static tWeiNInt proB; + proB.unpackWeight(N_, K_, wptr, 
FpData, ldb_, &orth); + } + } + } + return true; + } + return false; +} + +template +static void NSQ4GemmPackBImpl(void* PackedBuf, size_t BlkSize, const uint8_t* QData, const float* Scale, + const uint8_t* Zp, size_t N, size_t K, bool IsAsym, bool lastCall, size_t ldb, + void* ThreadPool) { + static T proB; + auto N_ = static_cast(N); + auto K_ = static_cast(K); + auto stor = proB.createStorage(N_, K_, static_cast(BlkSize), BTLA_DTYPE::S4_CLIP, BTLA_DTYPE::F32, + BTLA_DTYPE::BF16, IsAsym); + stor.assign(reinterpret_cast(PackedBuf)); + ORTThreading orth(ThreadPool); + proB.packNbitsWeightQ4(N_, K_, IsAsym, QData, static_cast(ldb), Scale, Zp, &stor, &orth); + if (lastCall) { + proB.reduceWeight(&stor, &orth); + } +} + +static size_t NSQ4GemmPackBSize(size_t N, size_t K, size_t BlkSize, bool isAsym, NS_SQNBIT_COMPUTE_TYPE CompType) { + GetCPUDevice(); + if (K % BlkSize != 0) { + return 0; + } + // from low precision to high precision + switch (CompType) { + case NSCompInt8: + if (!isAsym) { // asym int8 is not optimized, so fall through to others. + if (_cd->AMX_INT8() && BlkSize % tAMX_INT8_SS_KBlock::KTILE == 0) { + return NSQ4BuSize>(BlkSize, N, K, isAsym); + } + if (_cd->AVX512_VNNI() && BlkSize % tAVX512_VNNI_KBlock::KTILE == 0) { + return NSQ4BuSize>(BlkSize, N, K, isAsym); + } + if (_cd->AVX_VNNI() && BlkSize % tAVX_VNNI_KBlock::KTILE == 0) { + return NSQ4BuSize>(BlkSize, N, K, isAsym); + } + } + [[fallthrough]]; + case NSCompBf16: + case NSCompFp16: + case NSCompFp32: + case NSCompUndef: + if (_cd->AVX512F() && BlkSize % tAVX512F::KTILE == 0) { + return NSQ4BuSize>(BlkSize, N, K, isAsym); + } + if (_cd->AVX2() && BlkSize % tAVX2::KTILE == 0) { + return NSQ4BuSize>(BlkSize, N, K, isAsym); + } + [[fallthrough]]; + default: + return 0; + } +} + +static bool NSQ4GemmPackB(void* PackedBuf, const uint8_t* QData, const float* Scale, const uint8_t* Zp, size_t N, + size_t K, size_t ldb, size_t BlkSize, bool isAsym, bool lastCall, + NS_SQNBIT_COMPUTE_TYPE CompType, void* ThreadPool) { + GetCPUDevice(); + // explicit statement fall through. + switch (CompType) { + case NSCompInt8: + if (!isAsym) { // asym int8 is not optimized, so fall through to others. 
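+      // Probe ISAs from widest to narrowest (AMX-INT8, then AVX512-VNNI, then AVX-VNNI);
+      // the first available one whose KTILE divides the block size is selected.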
+ if (_cd->AMX_INT8() && BlkSize % tAMX_INT8_SS_KBlock::KTILE == 0) { + NSQ4GemmPackBImpl>( + PackedBuf, BlkSize, QData, Scale, Zp, N, K, isAsym, lastCall, ldb, ThreadPool); + return true; + } + if (_cd->AVX512_VNNI() && BlkSize % tAVX512_VNNI_KBlock::KTILE == 0) { + NSQ4GemmPackBImpl>( + PackedBuf, BlkSize, QData, Scale, Zp, N, K, isAsym, lastCall, ldb, ThreadPool); + return true; + } + if (_cd->AVX_VNNI() && BlkSize % tAVX_VNNI_KBlock::KTILE == 0) { + NSQ4GemmPackBImpl>(PackedBuf, BlkSize, QData, Scale, Zp, N, + K, isAsym, lastCall, ldb, ThreadPool); + return true; + } + } + [[fallthrough]]; + case NSCompBf16: + case NSCompFp16: + case NSCompFp32: + case NSCompUndef: + if (_cd->AVX512F() && BlkSize % tAVX512F::KTILE == 0) { + NSQ4GemmPackBImpl>(PackedBuf, BlkSize, QData, Scale, Zp, N, K, isAsym, + lastCall, ldb, ThreadPool); + return true; + } + if (_cd->AVX2() && BlkSize % tAVX2::KTILE == 0) { + NSQ4GemmPackBImpl>(PackedBuf, BlkSize, QData, Scale, Zp, N, K, isAsym, lastCall, + ldb, ThreadPool); + return true; + } + [[fallthrough]]; + default: + return false; + } +} + +size_t NSNBitsGemmPackBSize(size_t N, size_t K, size_t BlkSize, int nbits, bool isAsym, + NS_SQNBIT_COMPUTE_TYPE CompType) { + if (nbits == 4) { + auto jsize = NSQ4GemmPackBSize(N, K, BlkSize, isAsym, CompType); + if (jsize) { + return jsize; + } + } + return 0; +} + +void NSNBitsGemmPackB(void* PackedBuf, const uint8_t* QData, const float* Scale, const uint8_t* Zp, size_t N, size_t K, + size_t ldb, size_t BlkSize, int nbits, bool isAsym, bool lastCall, + NS_SQNBIT_COMPUTE_TYPE CompType, void* ThreadPool) { + if (nbits == 4) { + if (NSQ4GemmPackB(PackedBuf, QData, Scale, Zp, N, K, ldb, BlkSize, isAsym, lastCall, CompType, ThreadPool)) { + return; + } + } +} + +void NSNBitsGemmUnPackB(float* FpData, const void* PackedBuf, size_t N, size_t K, size_t ldb, void* ThreadPool) { + // only nbits=4 can be packed, so not necessary to check the nbits in DataParams + if (NSQ4GemmUnPackB(FpData, PackedBuf, N, K, ldb, ThreadPool)) { + return; + } +} + +size_t NSSQNBitsGemmBatchWorkspaceSize(const size_t M, const size_t N, const size_t K, const size_t BatchN, + const NS_SQNBITS_GEMM_DATA_PACKED_PARAMS* DataParams) { + // only nbits=4 can be packed, so not necessary to check the nbits in DataParams + return NSSQ4GemmBatchWorkspaceSize(M, N, K, BatchN, DataParams); +} + +void NSSQNBitsGemmBatchPackedB(const size_t M, const size_t N, const size_t K, const size_t BatchN, + const NS_SQNBITS_GEMM_DATA_PACKED_PARAMS* DataParams, void* WorkSpace, + void* ThreadPool) { + // only nbits=4 can be packed, so not necessary to check the nbits in DataParams + if (NSSQ4GemmBatchDriver(M, N, K, BatchN, DataParams, reinterpret_cast(WorkSpace), ThreadPool)) { + // PackedWeight is created by bestla + return; + } +} diff --git a/onnxruntime/contrib_ops/cpu/quantization/neural_speed_gemm.h b/onnxruntime/contrib_ops/cpu/quantization/neural_speed_gemm.h new file mode 100644 index 000000000000..ebcb3027a209 --- /dev/null +++ b/onnxruntime/contrib_ops/cpu/quantization/neural_speed_gemm.h @@ -0,0 +1,129 @@ +/*++ + +Copyright (c) Microsoft Corporation. All rights reserved. + +Licensed under the MIT License. + +Module Name: + + neural_speed_gemm.h + +Abstract: + + Prepack-weight GEMM APIs of neural_speed. 
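+
+    Typical call sequence (illustrative sketch only; tp is an onnxruntime
+    thread pool passed as void*, the other values are example parameters):
+
+      size_t packed_size = NSNBitsGemmPackBSize(N, K, block_size, /*nbits*/ 4,
+                                                is_asym, NSCompInt8);
+      NSNBitsGemmPackB(packed, q_data, scales, zero_points, N, K, /*ldb*/ K,
+                       block_size, /*nbits*/ 4, is_asym, /*last_call*/ true,
+                       NSCompInt8, tp);
+      NS_SQNBITS_GEMM_DATA_PACKED_PARAMS params{A, packed, C, lda, ldc};
+      size_t ws_size = NSSQNBitsGemmBatchWorkspaceSize(M, N, K, /*BatchN*/ 1, &params);
+      NSSQNBitsGemmBatchPackedB(M, N, K, /*BatchN*/ 1, &params, workspace, tp);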
+--*/ + +#pragma once + +#include +#include + +/** + * @brief Define compute types of block quantization + */ +enum NS_SQNBIT_COMPUTE_TYPE { + NSCompUndef = 0, /*!< undef */ + NSCompFp32 = 1, /*!< input fp32, accumulator fp32 */ + NSCompFp16 = 2, /*!< input fp16, accumulator fp16 */ + NSCompBf16 = 3, /*!< input bf16, accumulator fp32 */ + NSCompInt8 = 4 /*!< input int8, accumulator int32 */ +}; + +/** + * @brief Data parameters for NBits GEMM routine + * C = A * B + * A, C must be a float32 matrix + * B must be a packed nbits blob + * All except C are [in] parameters + */ +struct NS_SQNBITS_GEMM_DATA_PACKED_PARAMS { + const float* A = nullptr; /**< address of A (float32 matrix)*/ + const void* B = nullptr; /**< address of B (packed nbits blob)*/ + float* C = nullptr; /**< address of result matrix */ + size_t lda = 0; /**< leading dimension of A */ + size_t ldc = 0; /**< leading dimension of C*/ +}; + +/** + * @brief Compute the byte size of the parameter combination + * + * @param N the number of columns of matrix B. + * @param K the number of rows of matrix B. + * @param block_size size of the block to quantize, elements from the same block share the same + * scale and zero point + * @param nbits number of bits used for weight quantization + * @param is_asym flag for asymmetric quantization + * @param comp_type specify input data type and accumulator data type + * @return size of the packing buffer, 0 if the operation is not yet supported. + */ +size_t NSNBitsGemmPackBSize(size_t N, size_t K, size_t block_size, int nbits, bool is_asym, + NS_SQNBIT_COMPUTE_TYPE comp_type); + +/** + * @brief Prepack tensor data from n-bit quantized data, scale and zero point buffers. + * + * @param PackedBuf packed data buffer + * @param QData quantized data buffer + * @param Scale scale pointer + * @param Zp zero point pointer + * @param N the number of columns of matrix B. + * @param K the number of rows of matrix B. + * @param ldb leading dimension of B + * @param block_size size of the block to quantize, elements from the same block share the same + * scale and zero point + * @param nbits number of bits used for weight quantization (default 4) + * @param is_asym flag for asymmetric quantization + * @param comp_type specify input data type and accumulator data type + * @param last_call flag to activate the epilogue process of packB. OpKernel::PrePack will query input tensor + * one by one: QData, Scale, Zp (if is_asym is true). But kernel prefers to pack all tensors into one blob data where + * they can share the common attributes like: block_size. Meanwhile, kernel has some pre-computations to speed up + * inference which require that all blob data are ready. So, you need to set this flag to true when passing Scale + * (is_asym is false) and Zp(is_asym is true). + * @param thread_pool + */ +void NSNBitsGemmPackB(void* PackedBuf, const uint8_t* QData, const float* Scale, const uint8_t* Zp, size_t N, size_t K, + size_t ldb, size_t block_size, int nbits, bool is_asym, bool last_call, + NS_SQNBIT_COMPUTE_TYPE comp_type, void* thread_pool); + +/** + * @brief Unpack and dequantize to fp32 + * + * @param FpData unpacked float32 data + * @param PackedBuf quantized and packed data + * @param N the number of columns of matrix B. + * @param K the number of rows of matrix B. 
+ * @param ldb leading dimension of B + * @param thread_pool + */ +void NSNBitsGemmUnPackB(float* FpData, const void* PackedBuf, size_t N, size_t K, size_t ldb, void* thread_pool); + +/** + * @brief Get the workspace size required by computation. + * + * @param[in] M row size of matrix A and C + * @param[in] N column size of matrix B and C + * @param[in] K column size of matrix A and row size of matrix B + * @param[in] BatchN number of batches + * @param[inout] DataParams An array (size BatchN) of parameter blocks + * @return Workspace size in bytes + */ +size_t NSSQNBitsGemmBatchWorkspaceSize(const size_t M, const size_t N, const size_t K, const size_t BatchN, + const NS_SQNBITS_GEMM_DATA_PACKED_PARAMS* DataParams); + +/** + * @brief Batched GEMM: C = A * B + * A, C must be a float32 matrix + * B must be a packed nbits blob + * + * @param[in] M row size of matrix A and C + * @param[in] N column size of matrix B and C + * @param[in] K column size of matrix A and row size of matrix B + * @param[in] BatchN number of batches + * @param[inout] DataParams An array (size BatchN) of parameter blocks + * @param[in] WorkSpace temporary buffer + * @param[in] ThreadPool + * @return + */ +void NSSQNBitsGemmBatchPackedB(const size_t M, const size_t N, const size_t K, const size_t BatchN, + const NS_SQNBITS_GEMM_DATA_PACKED_PARAMS* DataParams, void* WorkSpace, + void* ThreadPool = nullptr); diff --git a/onnxruntime/contrib_ops/cpu/quantization/neural_speed_wrapper.h b/onnxruntime/contrib_ops/cpu/quantization/neural_speed_wrapper.h new file mode 100644 index 000000000000..e7df50408ef0 --- /dev/null +++ b/onnxruntime/contrib_ops/cpu/quantization/neural_speed_wrapper.h @@ -0,0 +1,40 @@ +//----------------------------------------------------------------------------- +// +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. +// +//----------------------------------------------------------------------------- +#pragma once +#if defined(__GNUC__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wsign-compare" +#pragma GCC diagnostic ignored "-Wmissing-field-initializers" +#pragma GCC diagnostic ignored "-Wunused-variable" +#pragma GCC diagnostic ignored "-Wunused-value" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#pragma GCC diagnostic ignored "-Wunused-function" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wclass-memaccess" +#pragma GCC diagnostic ignored "-Wunused-but-set-variable" +#pragma GCC diagnostic ignored "-Wunused-but-set-parameter" + +#elif defined(_MSC_VER) +#pragma warning(push) +#pragma warning(disable : 4457) +#pragma warning(disable : 4189) +#pragma warning(disable : 4100) +#pragma warning(disable : 4244) +#pragma warning(disable : 4267) +#pragma warning(disable : 4702) +#pragma warning(disable : 4127) +#endif + +#include "bestla/bestla_prologue_a.h" +#include "bestla/bestla_wrapper.h" + +#if defined(__GNUC__) +#pragma GCC diagnostic pop +#elif defined(_MSC_VER) +#pragma warning(pop) +#endif diff --git a/onnxruntime/contrib_ops/cpu/quantization/qlinear_concat.cc b/onnxruntime/contrib_ops/cpu/quantization/qlinear_concat.cc index ee9ae7167945..af163b6be702 100644 --- a/onnxruntime/contrib_ops/cpu/quantization/qlinear_concat.cc +++ b/onnxruntime/contrib_ops/cpu/quantization/qlinear_concat.cc @@ -1,4 +1,4 @@ -// Copyright (c Microsoft Corporation. All rights reserved. +// Copyright (c) Microsoft Corporation. All rights reserved. 
// Licensed under the MIT License. #include "qlinear_util.h" diff --git a/onnxruntime/contrib_ops/cpu/tokenizer.cc b/onnxruntime/contrib_ops/cpu/tokenizer.cc index 1787fb9b3c4a..89371106b379 100644 --- a/onnxruntime/contrib_ops/cpu/tokenizer.cc +++ b/onnxruntime/contrib_ops/cpu/tokenizer.cc @@ -2,12 +2,29 @@ // Licensed under the MIT License. #include "core/common/common.h" +#include "core/common/inlined_containers.h" #include "core/common/narrow.h" +#include "core/common/safeint.h" #include "core/common/utf8_util.h" -#include "core/framework/tensor.h" #include "core/framework/op_kernel.h" +#include "core/framework/tensor.h" #include "re2/re2.h" +#ifdef _MSC_VER +#include +#define ORT_PMR_ALLOCATOR_SUPPORTED +#endif + +#include +#include +#include + +#ifdef ORT_PMR_ALLOCATOR_SUPPORTED +using SlicesVector = std::pmr::vector; +#else +using SlicesVector = std::vector; +#endif + namespace onnxruntime { namespace contrib { @@ -21,6 +38,10 @@ class Tokenizer final : public OpKernel { Status Compute(OpKernelContext* context) const override; private: + Status EstimateNumberOfTokens(gsl::span input_span, + size_t& max_tokens_per_row, + size_t& total_tokens_estimate) const; + Status CharTokenize(OpKernelContext* context, size_t N, size_t C, gsl::span input_dims) const; @@ -31,11 +52,14 @@ class Tokenizer final : public OpKernel { size_t N, size_t C, gsl::span input_dims) const; + void OutputData(gsl::span rows, + size_t max_tokens, size_t max_output_index, std::string* output_data) const; + bool mark_{false}; std::string pad_value_; - int64_t mincharnum_{0}; + size_t mincharnum_{0}; bool char_tokenezation_{false}; - std::vector> separators_; + InlinedVector> separators_; std::unique_ptr regex_; }; @@ -50,8 +74,8 @@ ONNX_CPU_OPERATOR_TYPED_MS_KERNEL( contrib::Tokenizer); namespace tokenizer_details { -constexpr char start_text = 0x2; -constexpr char end_text = 0x3; +constexpr char kStartMarker = 0x2; +constexpr char kEndMarker = 0x3; } // namespace tokenizer_details using namespace tokenizer_details; @@ -65,9 +89,11 @@ Tokenizer::Tokenizer(const OpKernelInfo& info) : OpKernel(info) { status = info.GetAttr("pad_value", &pad_value_); ORT_ENFORCE(status.IsOK(), "attribute pad_value is not set"); - status = info.GetAttr("mincharnum", &mincharnum_); + int64_t mincharnum = 0; + status = info.GetAttr("mincharnum", &mincharnum); ORT_ENFORCE(status.IsOK(), "attribute mincharnum is not set"); - ORT_ENFORCE(mincharnum_ > 0, "attribute mincharnum must have a positive value"); + ORT_ENFORCE(mincharnum > 0, "attribute mincharnum must have a positive value"); + mincharnum_ = narrow(mincharnum); // Optional attributes either or std::vector separators; @@ -114,6 +140,25 @@ Tokenizer::Tokenizer(const OpKernelInfo& info) : OpKernel(info) { } } +Status Tokenizer::EstimateNumberOfTokens(gsl::span input_span, + size_t& max_tokens_per_row, size_t& total_tokens_estimate) const { + total_tokens_estimate = 0; + max_tokens_per_row = 0; + for (const auto& s : input_span) { + size_t utf8_chars = 0; // length in utf8 chars + if (!utf8_validate(reinterpret_cast(s.data()), s.size(), + utf8_chars)) { + return Status(common::ONNXRUNTIME, common::INVALID_ARGUMENT, + "Input string contains invalid utf8 chars: " + s); + } + auto tokens = std::max(1, utf8_chars / mincharnum_); + total_tokens_estimate += tokens; + max_tokens_per_row = std::max(max_tokens_per_row, tokens); + } + + return Status::OK(); +} + Status Tokenizer::CharTokenize(OpKernelContext* ctx, size_t N, size_t C, gsl::span input_dims) const { // With char tokenzation we get 
as many tokens as the number of @@ -131,14 +176,13 @@ Status Tokenizer::CharTokenize(OpKernelContext* ctx, size_t N, size_t C, tokens)) { // Please do not include the input text in the error message as it could // be deemed as a compliance violation by teams using this operator - return Status(common::ONNXRUNTIME, common::INVALID_ARGUMENT, - "Input string contains invalid utf8 chars"); + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Input string contains invalid utf8 chars:", s); } max_tokens = std::max(max_tokens, tokens); ++curr_input; } - std::vector output_dims(input_dims.begin(), input_dims.end()); + TensorShapeVector output_dims(input_dims.begin(), input_dims.end()); // Check if we have no output due to apparently empty strings input. if (max_tokens == 0) { output_dims.push_back(0); @@ -160,31 +204,30 @@ Status Tokenizer::CharTokenize(OpKernelContext* ctx, size_t N, size_t C, while (curr_input != last) { const auto& s = *curr_input; if (mark_) { - (output_data + output_index)->assign(&start_text, 1); + output_data[output_index].assign(&kStartMarker, 1); ++output_index; } size_t tokens = 0; const size_t str_len = s.size(); for (size_t token_idx = 0; token_idx < str_len;) { size_t tlen = 0; - bool result = utf8_bytes(static_cast(s[token_idx]), tlen); + [[maybe_unused]] bool result = utf8_bytes(static_cast(s[token_idx]), tlen); assert(result); - (void)result; assert(token_idx + tlen <= str_len); - *(output_data + output_index) = s.substr(token_idx, tlen); + output_data[output_index] = s.substr(token_idx, tlen); ++output_index; token_idx += tlen; ++tokens; } if (mark_) { - (output_data + output_index)->assign(&end_text, 1); + output_data[output_index].assign(&kEndMarker, 1); ++output_index; } // Padding strings assert(tokens + (static_cast(mark_) * 2) <= max_tokens); const size_t pads = max_tokens - (static_cast(mark_) * 2) - tokens; for (size_t p = 0; p < pads; ++p) { - *(output_data + output_index) = pad_value_; + output_data[output_index] = pad_value_; ++output_index; } ++curr_input; @@ -192,37 +235,162 @@ Status Tokenizer::CharTokenize(OpKernelContext* ctx, size_t N, size_t C, return Status::OK(); } +namespace { + +// We use std::vector in this case, because InlinedVector::clear() is incompatible +// with std::vector. It also deallocates memory, which is not what we want. + +// The compiler we are using GCC on Linux and Clang on MacOS does not +// have the library that support C++17 PMR. So we are only using it on Windows +// since the problem is acute on the platform. + +#ifdef ORT_PMR_ALLOCATOR_SUPPORTED +///

+/// This class provides a thin abstraction over the std::pmr::monotonic_buffer_resource.
+/// If the allocated buffer is not enough, additional allocations are done using
+/// new/delete.
+///
+class MonotonicAllocatorWithDefault : public std::pmr::monotonic_buffer_resource {
+ public:
+  MonotonicAllocatorWithDefault(void* ptr, size_t size_in_bytes)
+      : monotonic_buffer_resource(ptr, size_in_bytes, std::pmr::get_default_resource()) {}
+  MonotonicAllocatorWithDefault(void* ptr, size_t size_in_bytes, std::pmr::memory_resource* upstream)
+      : monotonic_buffer_resource(ptr, size_in_bytes, upstream) {}
+};
+
+class MemoryAllocator {
+ public:
+  explicit MemoryAllocator(size_t num_of_slices) {
+    size_t allocated_size = 0;
+    void* ptr = AlignedAllocate(num_of_slices, allocated_size);
+    resource_.emplace(ptr, allocated_size);
+  }
+
+  SlicesVector CreateVectorWithAllocator() {
+    return SlicesVector(&resource_.value());
+  }
+
+  SlicesVector& EmplaceBack(std::vector<SlicesVector>& rows) {
+    return rows.emplace_back(&resource_.value());
+  }
+
+ private:
+  /// <summary>
+  /// Pre-allocate memory for the tokens to reduce the number of individual
+  /// allocations and thus memory contention.
+  /// Used in conjunction with the PMR memory allocator.
+  /// </summary>
+  /// <param name="num">number of objects of T</param>
+  /// <param name="allocated_size">aligned allocated size</param>
+  /// <returns>pointer to the buffer (owned by the buffer holder buf_holder_)</returns>
+  void* AlignedAllocate(size_t num, size_t& allocated_size) {
+    constexpr size_t alignment = alignof(re2::StringPiece);
+    const size_t size_bytes = SafeInt<size_t>(num) * sizeof(re2::StringPiece) + alignment;
+    buf_holder_ = std::make_unique<char[]>(size_bytes);
+    void* ptr = buf_holder_.get();
+    allocated_size = size_bytes;
+    return std::align(alignment, size_bytes, ptr, allocated_size);
+  }
+
+  std::unique_ptr<char[]> buf_holder_;
+  std::optional<MonotonicAllocatorWithDefault> resource_;
+};
+
+#else
+
+class MemoryAllocator {
+ public:
+  explicit MemoryAllocator(size_t /* num_of_slices */) {
+  }
+
+  SlicesVector CreateVectorWithAllocator() const {
+    return SlicesVector{};
+  }
+
+  SlicesVector& EmplaceBack(std::vector<SlicesVector>& rows) const {
+    return rows.emplace_back();
+  }
+};
+
+#endif
+}  // namespace
+
+void Tokenizer::OutputData(gsl::span<const SlicesVector> rows,
+                           size_t max_tokens, [[maybe_unused]] size_t max_output_index,
+                           std::string* output_data) const {
+  size_t output_index = 0;
+  for (const auto& row : rows) {
+    [[maybe_unused]] size_t c_idx = output_index;
+    if (mark_) {
+      output_data[output_index++].assign(&kStartMarker, 1);
+    }
+    // Output tokens for this row
+    for (const auto& token : row) {
+      output_data[output_index++].assign(token.data(), token.length());
+    }
+    if (mark_) {
+      output_data[output_index++].assign(&kEndMarker, 1);
+    }
+    const size_t pads = max_tokens - (static_cast<size_t>(mark_) * 2) - row.size();
+    for (size_t p = 0; p < pads; ++p) {
+      output_data[output_index++] = pad_value_;
+    }
+    assert(output_index <= max_output_index);
+    assert((output_index - c_idx) <= max_tokens);
+  }
+}
+
 Status Tokenizer::SeparatorExpressionTokenizer(OpKernelContext* ctx,
                                                size_t N, size_t C,
                                                gsl::span<const int64_t> input_dims) const {
   using namespace re2;
-  std::vector<std::vector<StringPiece>> rows;
-  rows.reserve(N * C);
+
+  auto X = ctx->Input<Tensor>(0);
+  const auto input_span = X->DataAsSpan<std::string>();
+
+  // Let's estimate the maximum number of tokens.
+  // It is hard to estimate the number of separator characters that would not appear in the
+  // output.
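+  // Illustrative arithmetic (assumed values): with mincharnum_ == 2, a row of 11 utf8
+  // chars contributes std::max(1, 11 / 2) == 5 tokens to the estimate computed by
+  // EstimateNumberOfTokens().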
+ size_t total_tokens_estimate = 0; + size_t max_tokens_per_row = 0; + ORT_RETURN_IF_ERROR(EstimateNumberOfTokens(input_span, max_tokens_per_row, total_tokens_estimate)); + // Add a scratch token vector allocation + total_tokens_estimate += max_tokens_per_row; + + // Pre-allocate memory for all tokens (StringPieces) + MemoryAllocator allocator(total_tokens_estimate); + + // Make sure the vectors below are destroyed before the allocator + const size_t vector_num = SafeInt(N) * C; + + std::vector rows; + rows.reserve(vector_num); + + // Re-use the same vector for each tokenization round + SlicesVector tokens = allocator.CreateVectorWithAllocator(); + tokens.reserve(max_tokens_per_row); // We do not constraint the search to match // on the beginning or end of the string - const RE2::Anchor anchor = RE2::UNANCHORED; + constexpr RE2::Anchor anchor = RE2::UNANCHORED; // Scan all strings and attempt to find separators in them // collect all the output tokens here size_t max_tokens = 0; - auto X = ctx->Input(0); - auto const input_data = X->Data(); - auto curr_input = input_data; - auto const last = input_data + N * C; - while (curr_input != last) { - const auto& s = *curr_input; + for (const auto& s : input_span) { size_t utf8_chars = 0; // length in utf8 chars - if (!utf8_validate(reinterpret_cast(s.data()), s.size(), - utf8_chars)) { + if (!utf8_len(reinterpret_cast(s.data()), s.size(), + utf8_chars)) { return Status(common::ONNXRUNTIME, common::INVALID_ARGUMENT, "Input string contains invalid utf8 chars: " + s); } - std::vector row{s}; + const auto expected_tokens = std::max(1, utf8_chars / mincharnum_); + auto& row = allocator.EmplaceBack(rows); + row.reserve(expected_tokens); + row.emplace_back(s); for (const auto& sep : separators_) { - std::vector tokens; for (const auto& text : row) { const auto end_pos = text.length(); size_t start_pos = 0; @@ -244,7 +412,7 @@ Status Tokenizer::SeparatorExpressionTokenizer(OpKernelContext* ctx, return Status(common::ONNXRUNTIME, common::INVALID_ARGUMENT, "Match contains invalid utf8 chars: " + std::string{submatch}); } - if (utf8_chars >= size_t(mincharnum_)) { + if (utf8_chars >= mincharnum_) { tokens.emplace_back(text.data() + start_pos, token_len); } // Update starting position @@ -263,23 +431,32 @@ Status Tokenizer::SeparatorExpressionTokenizer(OpKernelContext* ctx, utf8_chars = 0; utf8_len(reinterpret_cast(text.data() + start_pos), trailing_len, utf8_chars); - if (utf8_chars >= size_t(mincharnum_)) { + if (utf8_chars >= mincharnum_) { tokens.emplace_back(text.data() + start_pos, trailing_len); } } } while (match); } // row - // Replace the row with the results of this tokenezation - row.swap(tokens); + + // We want to preserve the buffer for the next separator + // copying slices is cheaper than allocating new memory + if (!tokens.empty()) { + row = tokens; + tokens.clear(); + continue; + } + + // Nothing more to match for any remaining separators + row.clear(); + tokens.clear(); + break; } // separators_ max_tokens = std::max(max_tokens, row.size()); - rows.push_back(std::move(row)); - ++curr_input; } - std::vector output_dims(input_dims.begin(), input_dims.end()); + TensorShapeVector output_dims(input_dims.begin(), input_dims.end()); // Check if we have no output due to either empty input - // everything is a separator + // or everything is a separator if (max_tokens == 0) { output_dims.push_back(0); TensorShape output_shape(output_dims); @@ -297,39 +474,8 @@ Status Tokenizer::SeparatorExpressionTokenizer(OpKernelContext* ctx, auto output_tensor = 
ctx->Output(0, output_shape); auto const output_data = output_tensor->MutableData(); -#ifdef _DEBUG - const size_t max_output_index = N * C * max_tokens; -#endif - size_t output_index = 0; - curr_input = input_data; - for (auto& row : rows) { -#ifdef _DEBUG - size_t c_idx = output_index; -#endif - if (mark_) { - (output_data + output_index)->assign(&start_text, 1); - ++output_index; - } - // Output tokens for this row - for (const auto& token : row) { - (output_data + output_index)->assign(token.data(), token.size()); - ++output_index; - } - if (mark_) { - (output_data + output_index)->assign(&end_text, 1); - ++output_index; - } - const size_t pads = max_tokens - (static_cast(mark_) * 2) - row.size(); - for (size_t p = 0; p < pads; ++p) { - *(output_data + output_index) = pad_value_; - ++output_index; - } -#ifdef _DEBUG - assert(output_index <= max_output_index); - assert((output_index - c_idx) <= max_tokens); -#endif - ++curr_input; - } + OutputData(rows, max_tokens, narrow(output_shape.Size()), output_data); + return Status::OK(); } @@ -337,71 +483,78 @@ Status Tokenizer::TokenExpression(OpKernelContext* ctx, size_t N, size_t C, gsl::span input_dims) const { using namespace re2; - // Represents a token that will be output after - // first is the index, second is the size; - std::vector> tokens; - tokens.reserve(N * C); size_t max_tokens = 0; auto X = ctx->Input(0); - auto const input_data = X->Data(); - auto curr_input = input_data; - auto const last = input_data + N * C; + const auto input_span = X->DataAsSpan(); + + // Let's estimate maximum number of tokens + size_t total_tokens_estimate = 0; + size_t max_tokens_per_row = 0; + ORT_RETURN_IF_ERROR(EstimateNumberOfTokens(input_span, max_tokens_per_row, total_tokens_estimate)); + + // Pre-allocate memory for all tokens (StringPieces) + MemoryAllocator allocator(total_tokens_estimate); + + // Make sure the vectors below are destroyed before the allocator + const size_t vector_num = SafeInt(N) * C; + + // We use std::vector in this case, because InlinedVector::clear() is incompatible + // with std::vector. It also deallocates memory, which is not what we want. 
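Worth pausing on the allocation strategy the tokenizer hunks above share: count first, allocate once, then reuse. EstimateNumberOfTokens and MemoryAllocator are the diff's own helpers; the sketch below is a hypothetical, self-contained rendering of the same estimate-then-reserve pattern, not ORT's implementation.

```cpp
#include <algorithm>
#include <string>
#include <string_view>
#include <vector>

using SlicesVector = std::vector<std::string_view>;

// Crude upper bound: a row of n bytes holds at most n / mincharnum tokens.
void EstimateTokens(const std::vector<std::string>& input, size_t mincharnum,
                    size_t& max_per_row, size_t& total) {
  max_per_row = 0;
  total = 0;
  for (const auto& s : input) {
    const size_t per_row = std::max<size_t>(1, s.size() / mincharnum);
    max_per_row = std::max(max_per_row, per_row);
    total += per_row;
  }
}

std::vector<SlicesVector> TokenizeAll(const std::vector<std::string>& input,
                                      size_t mincharnum) {
  size_t max_per_row = 0, total = 0;
  EstimateTokens(input, mincharnum, max_per_row, total);
  (void)total;  // a real arena (like MemoryAllocator above) reserves `total` slices up front

  std::vector<SlicesVector> rows;
  rows.reserve(input.size());

  SlicesVector scratch;          // one scratch buffer, reused for every row
  scratch.reserve(max_per_row);  // sized once; clear() keeps the capacity

  for (const auto& s : input) {
    scratch.emplace_back(s);     // placeholder for the real separator matching
    rows.emplace_back(scratch);  // copying string_view slices is cheap
    scratch.clear();             // no deallocation, ready for the next row
  }
  return rows;
}
```

This is also why the old row.swap(tokens) disappears: swapping handed the row's buffer to the scratch vector and forced a fresh allocation on the next pass, whereas copying slices keeps both buffers warm.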
+ std::vector rows; + rows.reserve(vector_num); // We do not constraint the search to match // on the beginning or end of the string - const RE2::Anchor anchor = RE2::UNANCHORED; - - while (curr_input != last) { - const auto& s = *curr_input; + constexpr RE2::Anchor anchor = RE2::UNANCHORED; + for (const auto& s : input_span) { size_t utf8_chars = 0; - if (!utf8_validate(reinterpret_cast(s.data()), s.size(), - utf8_chars)) { - return Status(common::ONNXRUNTIME, common::INVALID_ARGUMENT, - "Input string contains invalid utf8 chars: " + s); - } - - tokens.emplace_back(); - auto& row = tokens.back(); - - StringPiece text(s); - const auto end_pos = s.length(); - size_t start_pos = 0; - StringPiece submatch; - - bool match = true; - do { - match = regex_->Match(text, start_pos, end_pos, anchor, &submatch, 1); - if (match) { - // Record pos/len - assert(submatch.data() != nullptr); - size_t match_pos = submatch.data() - s.data(); - assert(match_pos >= start_pos); - // Guard against empty match and make - // sure we make progress either way - auto token_len = submatch.length(); - utf8_chars = 0; - if (!utf8_len(reinterpret_cast(submatch.data()), token_len, utf8_chars)) { - return Status(common::ONNXRUNTIME, common::INVALID_ARGUMENT, - "Match contains invalid utf8 chars: " + std::string{submatch}); - } - if (utf8_chars >= size_t(mincharnum_)) { - row.push_back(submatch); - start_pos = match_pos + token_len; - } else { - size_t bytes = 0; - utf8_bytes(*submatch.data(), bytes); - start_pos = match_pos + bytes; + utf8_len(reinterpret_cast(s.data()), s.size(), utf8_chars); + + auto& row = allocator.EmplaceBack(rows); + + if (utf8_chars >= mincharnum_) { + auto estimated_tokens = std::max(1, utf8_chars / mincharnum_); + row.reserve(estimated_tokens); + + StringPiece text(s); + const auto end_pos = s.length(); + size_t start_pos = 0; + StringPiece submatch; + + bool match = true; + do { + match = regex_->Match(text, start_pos, end_pos, anchor, &submatch, 1); + if (match) { + // Record pos/len + assert(submatch.data() != nullptr); + size_t match_pos = submatch.data() - s.data(); + assert(match_pos >= start_pos); + // Guard against empty match and make + // sure we make progress either way + auto token_len = submatch.length(); + utf8_chars = 0; + if (!utf8_len(reinterpret_cast(submatch.data()), token_len, utf8_chars)) { + return Status(common::ONNXRUNTIME, common::INVALID_ARGUMENT, + "Match contains invalid utf8 chars: " + std::string{submatch}); + } + if (utf8_chars >= mincharnum_) { + row.push_back(submatch); + start_pos = match_pos + token_len; + } else { + size_t bytes = 0; + utf8_bytes(*submatch.data(), bytes); + start_pos = match_pos + bytes; + } } - } - } while (match); + } while (match); + } max_tokens = std::max(max_tokens, row.size()); - ++curr_input; } // Check for empty output - std::vector output_dims(input_dims.begin(), input_dims.end()); + TensorShapeVector output_dims(input_dims.begin(), input_dims.end()); // Check if we have no output due to either empty input // everything is a separator if (max_tokens == 0) { @@ -421,40 +574,7 @@ Status Tokenizer::TokenExpression(OpKernelContext* ctx, auto output_tensor = ctx->Output(0, output_shape); auto const output_data = output_tensor->MutableData(); -#ifdef _DEBUG - const size_t max_output_index = N * C * max_tokens; -#endif - curr_input = input_data; - size_t output_index = 0; - for (const auto& row : tokens) { - assert(curr_input != last); -#ifdef _DEBUG - size_t c_idx = output_index; -#endif - if (mark_) { - (output_data + 
output_index)->assign(&start_text, 1); - ++output_index; - } - // Output tokens for this row - for (const auto& token : row) { - (output_data + output_index)->assign(token.data(), token.length()); - ++output_index; - } - if (mark_) { - (output_data + output_index)->assign(&end_text, 1); - ++output_index; - } - const size_t pads = max_tokens - (static_cast(mark_) * 2) - row.size(); - for (size_t p = 0; p < pads; ++p) { - *(output_data + output_index) = pad_value_; - ++output_index; - } -#ifdef _DEBUG - assert(output_index <= max_output_index); - assert((output_index - c_idx) <= max_tokens); -#endif - ++curr_input; - } + OutputData(rows, max_tokens, narrow(output_shape.Size()), output_data); return Status::OK(); } diff --git a/onnxruntime/contrib_ops/cpu/transformers/beam_search_impl_gpt.h b/onnxruntime/contrib_ops/cpu/transformers/beam_search_impl_gpt.h index 56d950ca2f41..b18e122980ed 100644 --- a/onnxruntime/contrib_ops/cpu/transformers/beam_search_impl_gpt.h +++ b/onnxruntime/contrib_ops/cpu/transformers/beam_search_impl_gpt.h @@ -258,7 +258,7 @@ Status BeamSearchGpt::Execute(const FeedsFetchesManager* init_run_feeds_fetch cpu_state.sequences.InitDevice(beam_state.sequences_device); ORT_RETURN_IF_ERROR(this->device_copy_int32_func_(beam_state.sequences_device.subspan(0, beam_state.sequences_device.size() / 2), cpu_state.sequences_space.subspan(0, cpu_state.sequences_space.size() / 2), - nullptr, + this->ort_stream_, DeviceCopyDirection::hostToDevice)); } @@ -397,12 +397,8 @@ Status BeamSearchGpt::Execute(const FeedsFetchesManager* init_run_feeds_fetch output_sequences_scores); // Output per token scores - if (output_scores) { - gsl::span target = output_scores->MutableDataAsSpan(); - gsl::span source = beam_state.scores; - assert(target.size() == source.size()); - ORT_RETURN_IF_ERROR(this->device_copy_func_(target, source, nullptr, DeviceCopyDirection::deviceToDevice)); - } + gsl::span per_token_scores = beam_state.scores; + this->beam_scorer_->OutputScores(per_token_scores, output_scores); return status; } diff --git a/onnxruntime/contrib_ops/cpu/transformers/beam_search_impl_t5.h b/onnxruntime/contrib_ops/cpu/transformers/beam_search_impl_t5.h index 94547887d3a9..8f5cdc97f27e 100644 --- a/onnxruntime/contrib_ops/cpu/transformers/beam_search_impl_t5.h +++ b/onnxruntime/contrib_ops/cpu/transformers/beam_search_impl_t5.h @@ -214,7 +214,7 @@ Status BeamSearchT5::Execute(const FeedsFetchesManager& encoder_feeds_fetches cpu_state.sequences.InitDevice(beam_state.sequences_device); ORT_RETURN_IF_ERROR(this->device_copy_int32_func_(beam_state.sequences_device.subspan(0, beam_state.sequences_device.size() / 2), cpu_state.sequences_space.subspan(0, cpu_state.sequences_space.size() / 2), - nullptr, + this->ort_stream_, DeviceCopyDirection::hostToDevice)); } @@ -404,12 +404,8 @@ Status BeamSearchT5::Execute(const FeedsFetchesManager& encoder_feeds_fetches output_sequences_scores); // Output per token scores - if (output_scores) { - gsl::span target = output_scores->MutableDataAsSpan(); - gsl::span source = beam_state.scores; - assert(target.size() == source.size()); - ORT_RETURN_IF_ERROR(this->device_copy_func_(target, source, nullptr, DeviceCopyDirection::deviceToDevice)); - } + gsl::span per_token_scores = beam_state.scores; + this->beam_scorer_->OutputScores(per_token_scores, output_scores); return status; } diff --git a/onnxruntime/contrib_ops/cpu/transformers/beam_search_impl_whisper.h b/onnxruntime/contrib_ops/cpu/transformers/beam_search_impl_whisper.h index 91b93a125ad7..af0904b7d6e4 100644 
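Two fixes recur across the GPT, T5, and Whisper beam-search drivers in these hunks: the host-to-device copy of the initial sequences now runs on this->ort_stream_ instead of a null stream, and the per-token score copy is delegated to the scorer's new OutputScores (defined in the beam_search_scorer.cc hunks further down), which can also convert to fp16. The stream change is a correctness fix for asynchronous copies; here is the underlying CUDA semantics in isolation, an illustrative sketch rather than ORT's device_copy_int32_func_:

```cpp
#include <cstdint>
#include <cuda_runtime.h>

void CopySequencesH2D(const int32_t* host_src, int32_t* device_dst,
                      size_t count, cudaStream_t compute_stream) {
  // Issued on compute_stream: ordered with every kernel queued before or
  // after it on that stream, so downstream consumers simply see the data.
  cudaMemcpyAsync(device_dst, host_src, count * sizeof(int32_t),
                  cudaMemcpyHostToDevice, compute_stream);
  // With a nullptr stream the copy lands on the legacy default stream,
  // which is not guaranteed to be ordered with a non-blocking compute
  // stream -- a later kernel could read the buffer before the copy finishes.
}
```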
--- a/onnxruntime/contrib_ops/cpu/transformers/beam_search_impl_whisper.h +++ b/onnxruntime/contrib_ops/cpu/transformers/beam_search_impl_whisper.h @@ -134,8 +134,8 @@ Status BeamSearchWhisper::Execute(const FeedsFetchesManager& encoder_feeds_fe TensorShape no_speech_probs_shape{parameters->batch_size}; Tensor* no_speech_probs = this->context_.Output(parameters->no_speech_probs_output_id, no_speech_probs_shape); if (no_speech_probs && no_speech_probs->MutableData()) { - ORT_ENFORCE(parameters->no_speech_token >= 0 && parameters->no_speech_token < parameters->vocab_size, - "no_speech_token id out of range, it is ", parameters->no_speech_token, + ORT_ENFORCE(parameters->no_speech_token_id >= 0 && parameters->no_speech_token_id < parameters->vocab_size, + "no_speech_token_id is out of range, it is ", parameters->no_speech_token_id, ", vocab_size is ", parameters->vocab_size); this->parameters_->no_speech_probs = (void*)no_speech_probs->MutableData(); } @@ -226,7 +226,7 @@ Status BeamSearchWhisper::Execute(const FeedsFetchesManager& encoder_feeds_fe cpu_state.sequences.InitDevice(beam_state.sequences_device); ORT_RETURN_IF_ERROR(this->device_copy_int32_func_(beam_state.sequences_device.subspan(0, beam_state.sequences_device.size() / 2), cpu_state.sequences_space.subspan(0, cpu_state.sequences_space.size() / 2), - nullptr, + this->ort_stream_, DeviceCopyDirection::hostToDevice)); } @@ -500,12 +500,8 @@ Status BeamSearchWhisper::Execute(const FeedsFetchesManager& encoder_feeds_fe output_sequences_scores); // Output per token scores - if (output_scores) { - gsl::span target = output_scores->MutableDataAsSpan(); - gsl::span source = beam_state.scores; - assert(target.size() == source.size()); - ORT_RETURN_IF_ERROR(this->device_copy_func_(target, source, nullptr, DeviceCopyDirection::deviceToDevice)); - } + gsl::span per_token_scores = beam_state.scores; + this->beam_scorer_->OutputScores(per_token_scores, output_scores); return status; } diff --git a/onnxruntime/contrib_ops/cpu/transformers/beam_search_parameters.cc b/onnxruntime/contrib_ops/cpu/transformers/beam_search_parameters.cc index 3962486d5b5e..93837e785b4a 100644 --- a/onnxruntime/contrib_ops/cpu/transformers/beam_search_parameters.cc +++ b/onnxruntime/contrib_ops/cpu/transformers/beam_search_parameters.cc @@ -123,8 +123,20 @@ void BeamSearchParameters::ParseFromInputs(OpKernelContext* context) { logits_processor = logits_processor_tensor ? 
static_cast<int>(*logits_processor_tensor->Data<int32_t>()) : 0; ORT_ENFORCE(logits_processor >= 0, "logits_processor shall be a non-negative integer, got ", logits_processor); -} + if (this->model_type == IGenerationParameters::kModelTypeWhisper) { + auto* temperature_tensor = context->Input<Tensor>(14); + if (temperature_tensor) { + if (temperature_tensor->IsDataType<float>()) { + temperature = *temperature_tensor->Data<float>(); + } else { + temperature = static_cast<float>(*temperature_tensor->Data<MLFloat16>()); + } + } else { + temperature = 1.0f; + } + } +} void BeamSearchParameters::SetSubgraphParameters(int vocabulary_size, int heads, int hidden_size_per_head, int layers) { // Override vocab_size using the inferred shape from the decoder subgraph ONLY IF // the vocab_size hasn't been explicitly specified by the user (as an attribute of BeamSearch)
@@ -141,7 +153,13 @@ void WhisperBeamSearchParameters::ParseFromAttributes(const OpKernelInfo& info) model_type = static_cast<int>(info.GetAttrOrDefault<int64_t>("model_type", IGenerationParameters::kModelTypeWhisper)); ORT_ENFORCE(model_type == IGenerationParameters::kModelTypeWhisper); - no_speech_token = static_cast<int>(info.GetAttrOrDefault<int64_t>("no_speech_token", -1LL)); + // Token ids are defined below in the order that they appear in the tokenizer + translate_token_id = static_cast<int>(info.GetAttrOrDefault<int64_t>("translate_token_id", -1LL)); + transcribe_token_id = static_cast<int>(info.GetAttrOrDefault<int64_t>("transcribe_token_id", -1LL)); + start_of_lm_token_id = static_cast<int>(info.GetAttrOrDefault<int64_t>("start_of_lm_token_id", -1LL)); + no_speech_token_id = static_cast<int>(info.GetAttrOrDefault<int64_t>("no_speech_token_id", -1LL)); + no_timestamps_token_id = static_cast<int>(info.GetAttrOrDefault<int64_t>("no_timestamps_token_id", -1LL)); + beginning_timestamp_token_id = static_cast<int>(info.GetAttrOrDefault<int64_t>("beginning_timestamp_token_id", -1LL)); cross_qk_layer_head_input_id = 12; extra_decoding_ids_input_id = 13; cross_qk_output_id = 3;
diff --git a/onnxruntime/contrib_ops/cpu/transformers/beam_search_scorer.cc b/onnxruntime/contrib_ops/cpu/transformers/beam_search_scorer.cc index 7e2e5b212922..0eccbe26605f 100644 --- a/onnxruntime/contrib_ops/cpu/transformers/beam_search_scorer.cc +++ b/onnxruntime/contrib_ops/cpu/transformers/beam_search_scorer.cc
@@ -50,11 +50,12 @@ bool BeamHypotheses::CanImprove(float best_sum_logprobs, int current_length) con return beams_.back().score < current_score; } +template <typename T> void BeamHypotheses::Output( int top_k, int max_length, - gsl::span<int32_t>& sequences, // buffer filled with pad token ID, shape (num_return_sequences, max_length) - gsl::span<float>& sequences_scores) // buffer of shape (num_return_sequences) or empty + gsl::span<int32_t>& sequences, // buffer filled with pad token ID, shape (num_return_sequences, max_length) + gsl::span<T>& sequences_scores) // buffer of shape (num_return_sequences) or empty { // Copy the top_k beams into the sequences ORT_ENFORCE(top_k <= beams_used_);
@@ -67,7 +68,7 @@ void BeamHypotheses::Output( gsl::copy(item.hypothesis, target); if (!sequences_scores.empty()) - sequences_scores[index] = item.score; + sequences_scores[index] = (T)item.score; } }
@@ -181,21 +182,21 @@ void BeamSearchScorer::Process(ISequences& sequences, } } -void BeamSearchScorer::Finalize(ISequences& sequences, - gsl::span<const float>& final_beam_scores, - Tensor* output_sequences, - Tensor* output_sequence_scores) { - ORT_ENFORCE(output_sequences != nullptr); - +template <typename T> +void OutputSequenceScores(BeamSearchScorer* scorer, + ISequences& sequences, + gsl::span<const float>& final_beam_scores, + Tensor* output_sequences, + Tensor* output_sequence_scores) { // Finalize all open beam hypotheses and add to generated hypotheses. - for (size_t batch_index = 0; batch_index < batch_size_; batch_index++) { - BeamHypotheses& beam_hyp = beam_hyps_[batch_index]; + for (size_t batch_index = 0; batch_index < scorer->batch_size_; batch_index++) { + BeamHypotheses& beam_hyp = scorer->beam_hyps_[batch_index]; if (beam_hyp.done_) { continue; } - for (size_t beam_index = 0; beam_index < num_beams_; beam_index++) { - size_t batch_beam_index = batch_index * num_beams_ + beam_index; + for (size_t beam_index = 0; beam_index < scorer->num_beams_; beam_index++) { + size_t batch_beam_index = batch_index * scorer->num_beams_ + beam_index; float final_score = final_beam_scores[batch_beam_index]; auto final_tokens = sequences.GetSequence(narrow<int>(batch_beam_index)); beam_hyp.Add(final_tokens, final_score);
@@ -206,26 +207,59 @@ void BeamSearchScorer::Finalize(ISequences& sequences, gsl::span<int32_t> output = output_sequences->MutableDataAsSpan<int32_t>(); // Fill output sequences with pad token ID so that we do not need append it later. - std::fill_n(output.data(), output.size(), pad_token_id_); + std::fill_n(output.data(), output.size(), scorer->pad_token_id_); // Score of each sequence, with shape (batch_size * num_return_sequences). - gsl::span<float> sequence_scores; + gsl::span<T> sequence_scores; if (output_sequence_scores) { - sequence_scores = output_sequence_scores->MutableDataAsSpan<float>(); + sequence_scores = output_sequence_scores->MutableDataAsSpan<T>(); } // Select the best hypotheses according to number of sequences to return. - for (size_t batch_index = 0; batch_index < batch_size_; batch_index++) { - BeamHypotheses& beam_hyp = beam_hyps_[batch_index]; + for (size_t batch_index = 0; batch_index < scorer->batch_size_; batch_index++) { + BeamHypotheses& beam_hyp = scorer->beam_hyps_[batch_index]; - auto batch_output = output.subspan(batch_index * num_return_sequences_ * max_length_, - num_return_sequences_ * max_length_); - gsl::span<float> sequence_scores_buffer; + auto batch_output = output.subspan(batch_index * scorer->num_return_sequences_ * scorer->max_length_, + scorer->num_return_sequences_ * scorer->max_length_); + gsl::span<T> sequence_scores_buffer; if (!sequence_scores.empty()) - sequence_scores_buffer = sequence_scores.subspan(batch_index * num_return_sequences_, num_return_sequences_); + sequence_scores_buffer = sequence_scores.subspan(batch_index * scorer->num_return_sequences_, scorer->num_return_sequences_); + + beam_hyp.template Output<T>(narrow<int>(scorer->num_return_sequences_), narrow<int>(scorer->max_length_), batch_output, + sequence_scores_buffer); + } +} + +void BeamSearchScorer::Finalize(ISequences& sequences, + gsl::span<const float>& final_beam_scores, + Tensor* output_sequences, + Tensor* output_sequence_scores) { + ORT_ENFORCE(output_sequences != nullptr); - beam_hyp.Output(narrow<int>(num_return_sequences_), narrow<int>(max_length_), batch_output, - sequence_scores_buffer); + if (output_sequence_scores == nullptr || output_sequence_scores->IsDataType<float>()) { + OutputSequenceScores<float>(this, sequences, final_beam_scores, output_sequences, output_sequence_scores); + } else { + ORT_ENFORCE(output_sequence_scores->IsDataType<MLFloat16>()); + OutputSequenceScores<MLFloat16>(this, sequences, final_beam_scores, output_sequences, output_sequence_scores); + } +} + +void BeamSearchScorer::OutputScores(gsl::span<const float>& final_scores, Tensor* output_scores) { + if (output_scores) { + if (output_scores->IsDataType<float>()) { + gsl::span<float> target = output_scores->MutableDataAsSpan<float>(); + ORT_ENFORCE(target.size() == final_scores.size()); + std::copy_n(final_scores.data(), final_scores.size(), target.data()); + } else { + ORT_ENFORCE(output_scores->IsDataType<MLFloat16>()); + gsl::span<MLFloat16> target = output_scores->MutableDataAsSpan<MLFloat16>(); + ORT_ENFORCE(target.size() == final_scores.size()); + const float* src = final_scores.data(); + MLFloat16* dst = target.data(); + for (size_t i = 0; i < target.size(); i++) { + dst[i] = MLFloat16(src[i]); + } + } } }
diff --git a/onnxruntime/contrib_ops/cpu/transformers/beam_search_scorer.h b/onnxruntime/contrib_ops/cpu/transformers/beam_search_scorer.h index 94b6d340d9f4..dc92e8038a68 100644 --- a/onnxruntime/contrib_ops/cpu/transformers/beam_search_scorer.h +++ b/onnxruntime/contrib_ops/cpu/transformers/beam_search_scorer.h
@@ -35,10 +35,11 @@ struct BeamHypotheses { bool CanImprove(float best_sum_logprobs, int current_length) const; // Output results - void Output(int top_k, // number of sequences to return - int max_length, // max sequence length - gsl::span<int32_t>& sequences, // buffer with pad token, shape (num_return_sequences, max_length) - gsl::span<float>& sequences_scores); // buffer for sequence scores, with shape (num_return_sequences) + template <typename T> + void Output(int top_k, // number of sequences to return + int max_length, // max sequence length + gsl::span<int32_t>& sequences, // buffer with pad token, shape (num_return_sequences, max_length) + gsl::span<T>& sequences_scores); // buffer for sequence scores, with shape (num_return_sequences) gsl::span<HypothesisScore> beams_; // Beam width sized array of hypotheses, sorted by highest scoring int beams_used_; // Number of elements used in beams_
@@ -60,13 +61,14 @@ struct BeamSearchScorer : IBeamScorer { Tensor* output_sequences, Tensor* output_sequence_scores) override; + void OutputScores(gsl::span<const float>& final_scores, Tensor* output_scores) override; + bool IsDone() const override { return not_done_count_ == 0; } gsl::span<float> GetNextScores() override { return next_beam_scores_; } gsl::span<int32_t> GetNextTokens() override { return next_beam_tokens_; } gsl::span<int32_t> GetNextIndicesCPU() override { return next_beam_indices_; } - private: size_t batch_size_; size_t num_beams_; size_t max_length_;
diff --git a/onnxruntime/contrib_ops/cpu/transformers/generation_shared.h b/onnxruntime/contrib_ops/cpu/transformers/generation_shared.h index f6faf2e325f8..b1dd55eb20f3 100644 --- a/onnxruntime/contrib_ops/cpu/transformers/generation_shared.h +++ b/onnxruntime/contrib_ops/cpu/transformers/generation_shared.h
@@ -120,6 +120,9 @@ struct IBeamScorer { Tensor* output_sequences, Tensor* output_sequence_scores) = 0; + virtual void OutputScores(gsl::span<const float>& final_scores, + Tensor* output_scores) = 0; + virtual bool IsDone() const = 0; // GPU version will return false here, as it asynchronously queues up the event virtual bool IsDoneLater() const { return false; } // GPU version waits for the asynchronous result to complete here
@@ -180,7 +183,14 @@ struct IGenerationParameters { // Parameters for whisper model bool decoder_output_cross_qk = false; gsl::span<const int32_t> extra_decoding_ids; - int32_t no_speech_token = -1; + + // Token ids are defined below in the order that they appear in the tokenizer + int32_t translate_token_id = -1; + int32_t transcribe_token_id = -1; + int32_t start_of_lm_token_id = -1; + int32_t no_speech_token_id = -1; + int32_t no_timestamps_token_id = -1; + int32_t beginning_timestamp_token_id = -1; void* no_speech_probs = nullptr; int cross_qk_layer_head_input_id = -1;
diff --git a/onnxruntime/contrib_ops/cpu/transformers/logits_processor.cc b/onnxruntime/contrib_ops/cpu/transformers/logits_processor.cc index f39f090c78b0..c74e9160cc43 100644 --- a/onnxruntime/contrib_ops/cpu/transformers/logits_processor.cc +++ b/onnxruntime/contrib_ops/cpu/transformers/logits_processor.cc
@@ -17,14 +17,6 @@ namespace onnxruntime { namespace contrib { namespace transformers { -#ifdef DEBUG_GENERATION -template <typename T> -void DumpScores(const char* name, const NextTokenScores<T>& next_token_scores) { - std::cout << name << std::endl; - ORT_UNUSED_PARAMETER(next_token_scores); -} -#endif - // Interface for all scorers for beam search or beam sample. template <typename T> MinLengthLogitsProcessor<T>::MinLengthLogitsProcessor(int min_length, int eos_token_id)
@@ -36,10 +28,6 @@ void MinLengthLogitsProcessor<T>::Process(const ISequences* sequences, if (sequences->GetSequenceLength() < min_length_) { next_token_scores.SetScore(eos_token_id_, std::numeric_limits<T>::lowest()); } - -#ifdef DEBUG_GENERATION - DumpScores("MinLengthLogitsProcessor", next_token_scores); -#endif } template <typename T>
@@ -68,10 +56,6 @@ void RepetitionPenaltyLogitsProcessor<T>::Process(const ISequences* sequences, beam_token_scores[word_id] = (score < 0 ? score * penalty_ : score / penalty_); } } - -#ifdef DEBUG_GENERATION - DumpScores("RepetitionPenaltyLogitsProcessor", next_token_scores); -#endif } template <typename T>
@@ -109,10 +93,6 @@ void NoRepeatNGramLogitsProcessor<T>::Process(const ISequences* sequences, beam_token_scores[word_id] = std::numeric_limits<T>::lowest(); } } - -#ifdef DEBUG_GENERATION - DumpScores("NoRepeatNGramLogitsProcessor", next_token_scores); -#endif } template <typename T>
@@ -136,10 +116,6 @@ void VocabMaskLogitsProcessor<T>::Process(const ISequences* /*sequences*/, } } } - -#ifdef DEBUG_GENERATION - DumpScores("VocabMaskLogitsProcessor", next_token_scores); -#endif } template <typename T>
@@ -171,10 +147,6 @@ void PrefixVocabMaskLogitsProcessor<T>::Process(const ISequences* /*sequences*/, } } } - -#ifdef DEBUG_GENERATION - DumpScores("PrefixVocabMaskLogitsProcessor", next_token_scores); -#endif } template <typename T>
@@ -193,10 +165,6 @@ void TemperatureLogitsProcessor<T>::Process(const ISequences* /*sequences*/, *p /= temperature_; ++p; } - -#ifdef DEBUG_GENERATION - DumpScores("TemperatureLogitsProcessor", next_token_scores); -#endif } template <typename T>
@@ -218,10 +186,6 @@ void PresencePenaltyLogitsProcessor<T>::Process(const ISequences*, for (size_t i = 0; i < next_token_scores.scores.size(); i++) { *p -= presence_mask_[i] * presence_penalty_; } - -#ifdef DEBUG_GENERATION - DumpScores("PresencePenaltyLogitsProcessor", next_token_scores); -#endif } void LogitsProcessorList::Init(const BeamSearchParameters& parameters) {
diff --git a/onnxruntime/contrib_ops/cpu/transformers/logits_processor.h b/onnxruntime/contrib_ops/cpu/transformers/logits_processor.h index 4688ff272cee..231eb17d1a94 100644 --- a/onnxruntime/contrib_ops/cpu/transformers/logits_processor.h +++ b/onnxruntime/contrib_ops/cpu/transformers/logits_processor.h
@@ -10,6 +10,7 @@ #include "contrib_ops/cpu/transformers/greedy_search_parameters.h" #include "contrib_ops/cpu/transformers/sampling_parameters.h" #include "contrib_ops/cpu/transformers/generation_shared.h" +#include <iostream> namespace onnxruntime { namespace contrib {
@@ -34,6 +35,14 @@ struct NextTokenScores { } }; +#ifdef DEBUG_GENERATION +template <typename T> +void DumpScores(const char* name, const NextTokenScores<T>& next_token_scores) { + std::cout << name << std::endl; + ORT_UNUSED_PARAMETER(next_token_scores); +} +#endif + // Interface for all scorers for beam search or beam sample.
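The scorer changes above all serve one goal: BeamSearch can now emit sequence and token scores as either float or MLFloat16. BeamHypotheses::Output, the body of Finalize (extracted into the free OutputSequenceScores<T> helper, which is also why the private: specifier was dropped), and OutputScores are parameterized on the element type, and the non-template entry points dispatch once on the output tensor's dtype. A condensed sketch of that dispatch; it leans on ORT's Tensor/MLFloat16 API, but WriteScores/WriteScoresT are illustrative names:

```cpp
#include "core/framework/tensor.h"  // onnxruntime::Tensor, MLFloat16, gsl::span

namespace onnxruntime {

template <typename T>
void WriteScoresT(gsl::span<const float> src, gsl::span<T> dst) {
  for (size_t i = 0; i < src.size(); ++i) {
    dst[i] = static_cast<T>(src[i]);  // MLFloat16 is constructible from float
  }
}

// Non-template entry point: inspect the dtype once, then forward.
void WriteScores(gsl::span<const float> src, Tensor* out) {
  if (out->IsDataType<float>()) {
    WriteScoresT<float>(src, out->MutableDataAsSpan<float>());
  } else {  // the real code ORT_ENFORCEs MLFloat16 here
    WriteScoresT<MLFloat16>(src, out->MutableDataAsSpan<MLFloat16>());
  }
}

}  // namespace onnxruntime
```

Keeping the dispatch at the outermost entry point means the per-element loops stay monomorphic, with no type branch inside them.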
template class ILogitsProcessor { @@ -150,19 +159,25 @@ class PresencePenaltyLogitsProcessor : public ILogitsProcessor { template class TimestampLogitsProcessor : public ILogitsProcessor { public: - TimestampLogitsProcessor(int eos_token_id, int max_initial_timestamp_index) - : eos_token_id_(eos_token_id), max_initial_timestamp_index_(max_initial_timestamp_index) {} + TimestampLogitsProcessor(int end_of_text_token_id, // <|endoftext|> + int start_of_transcript_token_id, // <|startoftranscript|> + int translate_token_id, // <|translate|> + int transcribe_token_id, // <|transcribe|> + int start_of_lm_token_id, // <|startoflm|> + int no_timestamps_token_id, // <|notimestamps|> + int beginning_timestamp_token_id, // <|0.00|> + int max_initial_timestamp_index) + : end_of_text_token_id_(end_of_text_token_id), + start_of_transcript_token_id_(start_of_transcript_token_id), + translate_token_id_(translate_token_id), + transcribe_token_id_(transcribe_token_id), + start_of_lm_token_id_(start_of_lm_token_id), + no_timestamps_token_id_(no_timestamps_token_id), + beginning_timestamp_token_id_(beginning_timestamp_token_id), + max_initial_timestamp_index_(max_initial_timestamp_index) {} void Process(const ISequences* sequences, NextTokenScores& next_token_scores) override { - // TODO: translate_token_id_ and transcribe_token_id_ need to support both multilingual and English-only models. - const int beg_token_id_ = eos_token_id_ + 107; - const int not_token_id_ = eos_token_id_ + 106; - const int solm_token_id_ = eos_token_id_ + 105; - const int sot_token_id_ = eos_token_id_ + 1; - constexpr int translate_token_id_ = 50358; - constexpr int transcribe_token_id_ = 50359; - const int batch_beam_size = next_token_scores.batch_beam_size; const int vocab_size = next_token_scores.vocab_size; for (int i = 0; i < batch_beam_size; i++) { @@ -174,7 +189,7 @@ class TimestampLogitsProcessor : public ILogitsProcessor { size_t sample_begin = 0; for (size_t j = 0; j < seq_length; j++) { sample_begin++; - if (sequence[j] >= beg_token_id_) { + if (sequence[j] >= beginning_timestamp_token_id_) { break; } } @@ -182,30 +197,30 @@ class TimestampLogitsProcessor : public ILogitsProcessor { // Suppress tokens for (int j = 0; j < vocab_size; j++) { // Suppress notimestamps and solm tokens - if (j == not_token_id_ || j == solm_token_id_) { + if (j == no_timestamps_token_id_ || j == start_of_lm_token_id_) { beam_token_scores[j] = std::numeric_limits::lowest(); } // Suppress sot, translate and transcribe tokens if (seq_length > sample_begin) { - if (j == sot_token_id_ || j == translate_token_id_ || j == transcribe_token_id_) { + if (j == start_of_transcript_token_id_ || j == translate_token_id_ || j == transcribe_token_id_) { beam_token_scores[j] = std::numeric_limits::lowest(); } } } // Timestamps should be in pair except the first one - const bool last_was_timestamp = seq_length > 0 && sequence.back() >= beg_token_id_; - const bool penultimate_was_timestamp = seq_length <= sample_begin || sequence[seq_length - 2] >= beg_token_id_; + const bool last_was_timestamp = seq_length > 0 && sequence.back() >= beginning_timestamp_token_id_; + const bool penultimate_was_timestamp = seq_length <= sample_begin || sequence[seq_length - 2] >= beginning_timestamp_token_id_; if (last_was_timestamp) { if (penultimate_was_timestamp) { // If timestamps show up in pair, or it's the first timestamp, no more timestamp is generated - for (int j = beg_token_id_; j < vocab_size; j++) { + for (int j = beginning_timestamp_token_id_; j < vocab_size; j++) { 
beam_token_scores[j] = std::numeric_limits::lowest(); } } else { // If timestamp doesn't show up in pair, generate timestamp - for (int j = 0; j < eos_token_id_; j++) { + for (int j = 0; j < end_of_text_token_id_; j++) { beam_token_scores[j] = std::numeric_limits::lowest(); } } @@ -214,7 +229,7 @@ class TimestampLogitsProcessor : public ILogitsProcessor { // Find timestamp tokens std::vector timestamps; for (const auto& word_id : sequence) { - if (word_id >= beg_token_id_) { + if (word_id >= beginning_timestamp_token_id_) { timestamps.push_back(word_id); } } @@ -231,13 +246,13 @@ class TimestampLogitsProcessor : public ILogitsProcessor { timestamp_last = timestamps.back() + 1; } - for (int j = beg_token_id_; j < timestamp_last; j++) { + for (int j = beginning_timestamp_token_id_; j < timestamp_last; j++) { beam_token_scores[j] = std::numeric_limits::lowest(); } } if (seq_length == sample_begin) { - const int last_allowed = beg_token_id_ + max_initial_timestamp_index_; + const int last_allowed = beginning_timestamp_token_id_ + max_initial_timestamp_index_; for (int j = last_allowed + 1; j < vocab_size; j++) { beam_token_scores[j] = std::numeric_limits::lowest(); } @@ -247,8 +262,8 @@ class TimestampLogitsProcessor : public ILogitsProcessor { float timestamp_logprob = std::numeric_limits::lowest(); { float logsumexp = 0.0f; - const float logprob_max = *std::max_element(beam_token_scores.begin() + beg_token_id_, beam_token_scores.end()); - for (int j = beg_token_id_; j < vocab_size; ++j) { + const float logprob_max = *std::max_element(beam_token_scores.begin() + beginning_timestamp_token_id_, beam_token_scores.end()); + for (int j = beginning_timestamp_token_id_; j < vocab_size; ++j) { if (beam_token_scores[j] > std::numeric_limits::lowest()) { logsumexp += expf(beam_token_scores[j] - logprob_max); } @@ -258,21 +273,23 @@ class TimestampLogitsProcessor : public ILogitsProcessor { } } - const float max_text_token_logprob = *std::max_element(beam_token_scores.begin(), beam_token_scores.begin() + beg_token_id_); + const float max_text_token_logprob = *std::max_element(beam_token_scores.begin(), beam_token_scores.begin() + beginning_timestamp_token_id_); if (timestamp_logprob > max_text_token_logprob) { - for (int j = 0; j < beg_token_id_; ++j) { + for (int j = 0; j < beginning_timestamp_token_id_; ++j) { beam_token_scores[j] = std::numeric_limits::lowest(); } } } - -#ifdef DEBUG_GENERATION - DumpScores("TimestampLogitsProcessor", next_token_scores); -#endif } private: - int eos_token_id_; + int end_of_text_token_id_; + int start_of_transcript_token_id_; + int translate_token_id_; + int transcribe_token_id_; + int start_of_lm_token_id_; + int no_timestamps_token_id_; + int beginning_timestamp_token_id_; int max_initial_timestamp_index_; }; @@ -334,7 +351,15 @@ class LogitsProcessorList : public ILogitsProcessorList { // Add timestamp processor for whisper model if (parameters.model_type == IGenerationParameters::kModelTypeWhisper && parameters.logits_processor == IGenerationParameters::kLogitsProcessorTypeWhisper) { constexpr int max_initial_timestamp_index = 50; - timestamp_processor_ = std::make_unique>(parameters.eos_token_id, max_initial_timestamp_index); + // Token ids are passed below in the order that they appear in the tokenizer + timestamp_processor_ = std::make_unique>(parameters.eos_token_id, + parameters.decoder_start_token_id, + parameters.translate_token_id, + parameters.transcribe_token_id, + parameters.start_of_lm_token_id, + parameters.no_timestamps_token_id, + 
parameters.beginning_timestamp_token_id, + max_initial_timestamp_index); processor_list_.push_back(timestamp_processor_.get()); } diff --git a/onnxruntime/contrib_ops/cuda/activation/activations.cc b/onnxruntime/contrib_ops/cuda/activation/activations.cc index 1a86c5dbece5..6303858b9bd4 100644 --- a/onnxruntime/contrib_ops/cuda/activation/activations.cc +++ b/onnxruntime/contrib_ops/cuda/activation/activations.cc @@ -49,7 +49,6 @@ namespace cuda { UNARY_ACTIVATION_OP_HFD(Affine, 1, kOnnxDomain); UNARY_ACTIVATION_OP_HFD(ParametricSoftplus, 1, kOnnxDomain); UNARY_ACTIVATION_OP_HFD(ScaledTanh, 1, kOnnxDomain); -UNARY_ACTIVATION_OP_HFD(Gelu, 1, kMSDomain); UNARY_ACTIVATION_OP_HFD(QuickGelu, 1, kMSDomain); REGISTER_ACTIVATION_KERNEL(ThresholdedRelu, 1, kOnnxDomain, MLFloat16) diff --git a/onnxruntime/contrib_ops/cuda/activation/activations.h b/onnxruntime/contrib_ops/cuda/activation/activations.h index ab339f276c2b..fc9a71b0b7fa 100644 --- a/onnxruntime/contrib_ops/cuda/activation/activations.h +++ b/onnxruntime/contrib_ops/cuda/activation/activations.h @@ -66,17 +66,6 @@ class ScaledTanh final : public UnaryElementwise { float beta_; }; -template -class Gelu final : public UnaryElementwise { - public: - Gelu(const OpKernelInfo& info) : UnaryElementwise(info) {} - - Status ComputeInternal(OpKernelContext* context) const override; - - private: - MAKE_FUNC_CTX_NULL() -}; - template class QuickGelu final : public UnaryElementwise { public: diff --git a/onnxruntime/contrib_ops/cuda/activation/activations_impl.cu b/onnxruntime/contrib_ops/cuda/activation/activations_impl.cu index 0c856815fd43..36f33fbb24c1 100644 --- a/onnxruntime/contrib_ops/cuda/activation/activations_impl.cu +++ b/onnxruntime/contrib_ops/cuda/activation/activations_impl.cu @@ -36,20 +36,6 @@ struct OP_ScaledTanh : public CtxScaledTanh { } }; -template -struct OP_Gelu : public CtxGelu { - __device__ __inline__ T operator()(const T& a) const { - return _Gelu(a); - } -}; - -template <> -struct OP_Gelu : public CtxGelu { - __device__ __inline__ half operator()(const half& a) const { - return static_cast(_Gelu(static_cast(a))); - } -}; - template struct OP_QuickGelu : public CtxQuickGelu { __device__ __inline__ T operator()(const T& a) const { diff --git a/onnxruntime/contrib_ops/cuda/activation/activations_impl.h b/onnxruntime/contrib_ops/cuda/activation/activations_impl.h index 5d18283a395e..782d4bf59a5a 100644 --- a/onnxruntime/contrib_ops/cuda/activation/activations_impl.h +++ b/onnxruntime/contrib_ops/cuda/activation/activations_impl.h @@ -11,14 +11,12 @@ namespace cuda { typedef onnxruntime::cuda::CtxAlphaBeta CtxAffine; typedef onnxruntime::cuda::CtxAlphaBeta CtxParametricSoftplus; typedef onnxruntime::cuda::CtxAlphaBeta CtxScaledTanh; -typedef onnxruntime::cuda::CtxNull CtxGelu; typedef onnxruntime::cuda::CtxAlpha CtxQuickGelu; #define UNARY_CONTRIB_ACTIVATION_OPS() \ UNARY_ACTIVATION_OP_NAME(ScaledTanh) \ UNARY_ACTIVATION_OP_NAME(Affine) \ UNARY_ACTIVATION_OP_NAME(ParametricSoftplus) \ - UNARY_ACTIVATION_OP_NAME(Gelu) \ UNARY_ACTIVATION_OP_NAME(QuickGelu) #define UNARY_ACTIVATION_OP_NAME(name) UNARY_ACTIVATION_IMPL_DECLARATION(name); diff --git a/onnxruntime/contrib_ops/cuda/bert/add_bias_transpose.cu b/onnxruntime/contrib_ops/cuda/bert/add_bias_transpose.cu index 626e4c0b87a3..9e6752b45186 100644 --- a/onnxruntime/contrib_ops/cuda/bert/add_bias_transpose.cu +++ b/onnxruntime/contrib_ops/cuda/bert/add_bias_transpose.cu @@ -640,7 +640,7 @@ void InvokeAddBiasTranspose( cudaStream_t stream, const int num_matrices, const int 
format, const int max_threads_per_block, const int batch_size, const int sequence_length, const int num_heads, const int qk_head_size, const T* input, const T* biases, T* output, T* qkv_add_bias, const int v_head_size, int total_matrix_count, - bool do_rotary = false, int past_sequence_length = 0) { + bool do_rotary = false, int rotary_embedding = 0, int past_sequence_length = 0) { assert(num_heads <= max_threads_per_block); if (do_rotary) { @@ -650,20 +650,20 @@ void InvokeAddBiasTranspose( if (format != 1 && format != 2 && format != 3) { ORT_THROW("format must be 1, 2 or 3 for rotary attention"); } - if (qk_head_size != 64 && qk_head_size != 128) { - ORT_THROW("qk_head_size must be 64 or 128 for rotary attention"); + if (rotary_embedding != 32 && rotary_embedding != 64 && rotary_embedding != 128) { + ORT_THROW("rotary_embedding must be 32, 64 or 128 for rotary attention"); } if (v_head_size != -1 && qk_head_size != v_head_size) { ORT_THROW("qk_head_size must be equal to v_head_size for rotary attention"); } const int step = past_sequence_length == 0 ? sequence_length : past_sequence_length; - size_t smem_size = 2 * qk_head_size * sizeof(T); + size_t smem_size = 2 * rotary_embedding * sizeof(T); const dim3 grid(sequence_length, num_heads, batch_size); const dim3 block((qk_head_size / 2 + 31) / 32 * 32, 1, 1); AddBiasTransposeQKV<<>>(total_matrix_count, input, biases, output, - qkv_add_bias, qk_head_size, qk_head_size, + qkv_add_bias, rotary_embedding, qk_head_size, step, format); #else ORT_THROW("Rotary Attention is supported on sm >= 530. Current sm is", __CUDA_ARCH__); @@ -727,7 +727,7 @@ void LaunchAddBiasTranspose( cudaStream_t stream, const int num_matrices, const int format, const int max_threads_per_block, const int batch_size, const int sequence_length, const int num_heads, const int qk_head_size, const half* input, const half* biases, half* output, bool enable_half4, const int v_head_size, - half* qkv_add_bias, int total_matrix_count, bool do_rotary, int past_sequence_length) { + half* qkv_add_bias, int total_matrix_count, bool do_rotary, int rotary_embedding, int past_sequence_length) { total_matrix_count = std::max(num_matrices, total_matrix_count); if (enable_half4 && 0 == (qk_head_size % 4) && (v_head_size == -1 || 0 == (v_head_size % 4)) && !do_rotary) { const int H = qk_head_size / 4; @@ -753,7 +753,7 @@ void LaunchAddBiasTranspose( InvokeAddBiasTranspose( stream, num_matrices, format, max_threads_per_block, batch_size, sequence_length, num_heads, qk_head_size, input, biases, output, - qkv_add_bias, v_head_size, total_matrix_count, do_rotary, past_sequence_length); + qkv_add_bias, v_head_size, total_matrix_count, do_rotary, rotary_embedding, past_sequence_length); } } @@ -763,7 +763,7 @@ void LaunchAddBiasTranspose( const int batch_size, const int sequence_length, const int num_heads, const int qk_head_size, const float* input, const float* biases, float* output, bool /*enable_half4*/, const int v_head_size, float* qkv_add_bias, int total_matrix_count, bool do_rotary, - int past_sequence_length) { + int rotary_embedding, int past_sequence_length) { total_matrix_count = std::max(num_matrices, total_matrix_count); if (0 == (qk_head_size % 4) && (v_head_size == -1 || 0 == (v_head_size % 4)) && !do_rotary) { const int H = qk_head_size / 4; @@ -789,7 +789,8 @@ void LaunchAddBiasTranspose( InvokeAddBiasTranspose( stream, num_matrices, format, max_threads_per_block, batch_size, sequence_length, num_heads, qk_head_size, input, biases, output, - qkv_add_bias, v_head_size, 
total_matrix_count, do_rotary, past_sequence_length); + qkv_add_bias, v_head_size, total_matrix_count, do_rotary, rotary_embedding, + past_sequence_length); } } @@ -842,11 +843,11 @@ void InvokeAddBiasTransposeTrt( template <> void LaunchAddBiasTransposeTrt( - cudaStream_t stream, const int max_threads_per_block, - const int batch_size, const int sequence_length, - const int num_heads, const int head_size, - const float* biases, const float* query, const float* key, const float* value, float* output, - bool is_cross_attention, int kv_sequence_length) { + cudaStream_t /*stream*/, const int /*max_threads_per_block*/, + const int /*batch_size*/, const int /*sequence_length*/, + const int /*num_heads*/, const int /*head_size*/, + const float* /*biases*/, const float* /*query*/, const float* /*key*/, const float* /*value*/, float* /*output*/, + bool /*is_cross_attention*/, int /*kv_sequence_length*/) { ORT_ENFORCE(false, "Shall not call this since fused kernel does not support float input."); } diff --git a/onnxruntime/contrib_ops/cuda/bert/add_bias_transpose.h b/onnxruntime/contrib_ops/cuda/bert/add_bias_transpose.h index d903267c99a0..efc31db43bcd 100644 --- a/onnxruntime/contrib_ops/cuda/bert/add_bias_transpose.h +++ b/onnxruntime/contrib_ops/cuda/bert/add_bias_transpose.h @@ -33,7 +33,7 @@ void LaunchAddBiasTranspose( cudaStream_t stream, const int num_matrices, const int format, const int max_threads_per_block, const int batch_size, const int sequence_length, const int num_heads, const int qk_head_size, const T* input, const T* biases, T* output, bool enable_half4, const int v_head_size, T* qkv_add_bias = nullptr, - int total_matrix_count = -1, bool do_rotary = false, int past_sequence_length = 0); + int total_matrix_count = -1, bool do_rotary = false, int rotary_embedding = 0, int past_sequence_length = 0); // Add (bias) and Transpose for separated inputs of Q, K and V, and output Trt format. 
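Two of the preceding changes reward a closer look. First, the TimestampLogitsProcessor rework a few hunks back: instead of deriving Whisper's special token ids from eos_token_id via hardcoded offsets (which, per the removed TODO, could not hold for both multilingual and English-only vocabularies), each id is now passed in explicitly. Its core sampling rules are compact enough to restate; the sketch below is a simplified, illustrative version that omits the sample_begin and max-initial-timestamp handling:

```cpp
#include <algorithm>
#include <cmath>
#include <limits>
#include <vector>

// scores: one beam's logprobs over the vocab; ids >= beg_ts_id are timestamps.
void ApplyTimestampPairingRules(std::vector<float>& scores,
                                const std::vector<int>& sequence,
                                int beg_ts_id) {
  const float kNegInf = std::numeric_limits<float>::lowest();
  const size_t n = sequence.size();
  const bool last_was_ts = n > 0 && sequence[n - 1] >= beg_ts_id;
  const bool prev_was_ts = n > 1 && sequence[n - 2] >= beg_ts_id;

  if (last_was_ts) {
    if (prev_was_ts) {
      // A closed <ts, ts> pair: the next token must be text.
      std::fill(scores.begin() + beg_ts_id, scores.end(), kNegInf);
    } else {
      // An open timestamp: force the next token to close the pair.
      std::fill(scores.begin(), scores.begin() + beg_ts_id, kNegInf);
    }
  }

  // If timestamps are jointly more probable than the best text token,
  // allow only timestamps (logsumexp over the timestamp tail).
  const float ts_max = *std::max_element(scores.begin() + beg_ts_id, scores.end());
  float sum_exp = 0.0f;
  for (size_t j = beg_ts_id; j < scores.size(); ++j) {
    if (scores[j] > kNegInf) sum_exp += std::exp(scores[j] - ts_max);
  }
  const float text_max = *std::max_element(scores.begin(), scores.begin() + beg_ts_id);
  if (sum_exp > 0.0f && std::log(sum_exp) + ts_max > text_max) {
    std::fill(scores.begin(), scores.begin() + beg_ts_id, kNegInf);
  }
}
```

Second, the add_bias_transpose hunks immediately above decouple the rotary dimension from the head size: shared memory and the rotation now use rotary_embedding (32, 64, or 128), so only a prefix of each head is rotated. A host-side sketch of partial rotary application, assuming one common pair layout (i, i + rotary_dim/2); the kernel's actual element pairing may differ:

```cpp
#include <cmath>
#include <vector>

void ApplyRotary(std::vector<float>& head, int rotary_dim, int step) {
  const int half = rotary_dim / 2;
  for (int i = 0; i < half; ++i) {
    const float theta = step * std::pow(10000.0f, -2.0f * i / rotary_dim);
    const float c = std::cos(theta), s = std::sin(theta);
    const float x0 = head[i], x1 = head[i + half];
    head[i]        = x0 * c - x1 * s;  // rotate the pair (i, i + half)
    head[i + half] = x0 * s + x1 * c;
  }
  // head[rotary_dim..] passes through unchanged (partial rotary).
}
```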
// For self attention: diff --git a/onnxruntime/contrib_ops/cuda/bert/attention.cc b/onnxruntime/contrib_ops/cuda/bert/attention.cc index bf6431cf1afb..7a807342ad68 100644 --- a/onnxruntime/contrib_ops/cuda/bert/attention.cc +++ b/onnxruntime/contrib_ops/cuda/bert/attention.cc @@ -84,6 +84,8 @@ Status Attention::ComputeInternal(OpKernelContext* context) const { auto& device_prop = GetDeviceProp(); AttentionParameters parameters; + parameters.use_tf32 = UseTF32(); + // Use the second dimension from weight for bias to get q_hidden_size when bias is nullptr std::vector bias_dims{weights->Shape().GetDims()[1]}; const TensorShape bias_shape{bias_dims}; @@ -251,7 +253,7 @@ Status Attention::ComputeInternal(OpKernelContext* context) const { cublas, CUBLAS_OP_N, CUBLAS_OP_N, n, m, k, &one, reinterpret_cast(weights->Data()), n, reinterpret_cast(input->Data()), k, - &zero, reinterpret_cast(gemm_buffer.get()), n, device_prop)); + &zero, reinterpret_cast(gemm_buffer.get()), n, device_prop, UseTF32())); constexpr size_t element_size = sizeof(T); constexpr bool use_fused_cross_attention = false; diff --git a/onnxruntime/contrib_ops/cuda/bert/attention_impl.cu b/onnxruntime/contrib_ops/cuda/bert/attention_impl.cu index 83c426e7e6ed..a93fdf74dc28 100644 --- a/onnxruntime/contrib_ops/cuda/bert/attention_impl.cu +++ b/onnxruntime/contrib_ops/cuda/bert/attention_impl.cu @@ -58,12 +58,12 @@ size_t AlignSize(size_t bytes) { return bytesAligned; } -void CumulatedSequenceLengthCache::Initialize(int32_t sequence_length, cudaStream_t stream) { - if (this->sequence_length != sequence_length) { +void CumulatedSequenceLengthCache::Initialize(int32_t seq_length, cudaStream_t stream) { + if (this->sequence_length != seq_length) { ORT_ENFORCE(buffer.get() != nullptr && this->max_batch_size > 0); LaunchTrtSequenceOffset(reinterpret_cast(buffer.get()), nullptr, - this->max_batch_size, sequence_length, stream); - this->sequence_length = sequence_length; + this->max_batch_size, seq_length, stream); + this->sequence_length = seq_length; } } @@ -213,9 +213,9 @@ Status FusedTrtCrossAttention( template <> Status FusedTrtCrossAttention( - cudaStream_t stream, - contrib::AttentionParameters& parameters, - AttentionData& data) { + cudaStream_t /*stream*/, + contrib::AttentionParameters& /*parameters*/, + AttentionData& /*data*/) { return ORT_MAKE_STATUS(ONNXRUNTIME, StatusCode::NOT_IMPLEMENTED, "Trt fused cross attention does not support float tensor"); } @@ -276,9 +276,9 @@ Status FusedTrtSelfAttention( // Template Specialization for float type template <> Status FusedTrtSelfAttention( - cudaStream_t stream, - contrib::AttentionParameters& parameters, - AttentionData& data) { + cudaStream_t /*stream*/, + contrib::AttentionParameters& /*parameters*/, + AttentionData& /*data*/) { return ORT_MAKE_STATUS(ONNXRUNTIME, StatusCode::NOT_IMPLEMENTED, "Trt fused attention does not support float tensor"); } @@ -313,10 +313,11 @@ Status FlashAttention( parameters.batch_size, parameters.total_sequence_length, parameters.num_heads, parameters.v_head_size); + bool is_bf16 = false; ORT_RETURN_IF_ERROR(onnxruntime::flash::mha_fwd( device_prop, stream, query, key, value, data.output, reinterpret_cast(data.scratch), parameters.batch_size, parameters.num_heads, parameters.num_heads, parameters.head_size, - parameters.sequence_length, parameters.total_sequence_length, scale, parameters.is_unidirectional, + parameters.sequence_length, parameters.total_sequence_length, scale, parameters.is_unidirectional, is_bf16, parameters.num_splits, 
reinterpret_cast(data.softmax_lse_accum), reinterpret_cast(data.out_accum), true)); @@ -460,7 +461,8 @@ Status UnfusedAttention( total_sequence_length, sequence_length, qk_head_size, &alpha, data.k, qk_head_size, present_size_per_batch_k, data.q, qk_head_size, sequence_length * qk_head_size, - &zero, data.scratch, total_sequence_length, sequence_length * total_sequence_length, batches, device_prop)); + &zero, data.scratch, total_sequence_length, sequence_length * total_sequence_length, batches, + device_prop, parameters.use_tf32)); DUMP_TENSOR_D("Q", data.q, batch_size, num_heads, sequence_length, qk_head_size); DUMP_TENSOR_D("K", data.k, batch_size, num_heads, qk_head_size, sequence_length); @@ -513,7 +515,7 @@ Status UnfusedAttention( v_head_size, sequence_length, total_sequence_length, &one, data.v, v_head_size, present_size_per_batch_v, scratch2, total_sequence_length, sequence_length * total_sequence_length, - &zero, temp_output, v_head_size, sequence_length * v_head_size, batches, device_prop)); + &zero, temp_output, v_head_size, sequence_length * v_head_size, batches, device_prop, parameters.use_tf32)); // Temp_output is BxNxSxH_v, transpose to output BxSxNxH_v Status result = LaunchTransCtx(stream, sequence_length, batch_size, v_head_size, num_heads, diff --git a/onnxruntime/contrib_ops/cuda/bert/attention_prepare_qkv.cu b/onnxruntime/contrib_ops/cuda/bert/attention_prepare_qkv.cu index 5c65a30918ec..b843966d88e8 100644 --- a/onnxruntime/contrib_ops/cuda/bert/attention_prepare_qkv.cu +++ b/onnxruntime/contrib_ops/cuda/bert/attention_prepare_qkv.cu @@ -65,7 +65,8 @@ Status PrepareQkv_Attention(contrib::AttentionParameters& parameters, LaunchAddBiasTranspose(stream, matrix_to_transpose, format, max_threads_per_block, batch_size, sequence_length, num_heads, qk_head_size, data.gemm_buffer, data.bias, qkv, true, v_head_size, qkv_add_bias, - 3, parameters.do_rotary, parameters.past_sequence_length); + 3, parameters.do_rotary, parameters.rotary_embedding, + parameters.past_sequence_length); } return Status::OK(); } @@ -230,7 +231,7 @@ Status PrepareQkv_MHA_PackedQKV(contrib::AttentionParameters& parameters, AttentionData& data, cudaStream_t stream, int max_threads_per_block, - T* q, T* k, T* v, AttentionQkvFormat& qkv_format) { + T* /*q*/, T* /*k*/, T* /*v*/, AttentionQkvFormat& qkv_format) { const int batch_size = parameters.batch_size; const int sequence_length = parameters.sequence_length; const int num_heads = parameters.num_heads; @@ -278,7 +279,7 @@ Status PrepareQkv_MHA_PackedKV(contrib::AttentionParameters& parameters, AttentionData& data, cudaStream_t stream, int max_threads_per_block, - T* q, T* k, T* v, AttentionQkvFormat& qkv_format) { + T* /*q*/, T* k, T* /*v*/, AttentionQkvFormat& qkv_format) { const int batch_size = parameters.batch_size; const int kv_sequence_length = parameters.kv_sequence_length; const int num_heads = parameters.num_heads; diff --git a/onnxruntime/contrib_ops/cuda/bert/cutlass_fmha/fmha_launch_template.h b/onnxruntime/contrib_ops/cuda/bert/cutlass_fmha/fmha_launch_template.h index db78722cc0e4..c12cb374d9ad 100644 --- a/onnxruntime/contrib_ops/cuda/bert/cutlass_fmha/fmha_launch_template.h +++ b/onnxruntime/contrib_ops/cuda/bert/cutlass_fmha/fmha_launch_template.h @@ -242,18 +242,18 @@ void DispatchIsAligned(const MemoryEfficientAttentionParams& params) { using AlignedAK = AttentionKernel; #if defined(_MSC_VER) && !defined(__clang__) #pragma warning(push) -#pragma warning(disable : 6287) +#pragma warning(disable : 6287 4189) // kAligned is used via 
capture so 4189 warning seems incorrect #endif // Run a more efficient kernel with `isAligned=True` when memory is correctly aligned. bool is_aligned = params.qk_head_size % AlignedAK::kAlignmentQ == 0 && params.qk_head_size % AlignedAK::kAlignmentK == 0 && params.v_head_size % AlignedAK::kAlignmentV == 0; -#if defined(_MSC_VER) && !defined(__clang__) -#pragma warning(pop) -#endif DISPATCH_BOOL(is_aligned, kIsAligned, ([&]() { LaunchCutlassFmha(params); })); +#if defined(_MSC_VER) && !defined(__clang__) +#pragma warning(pop) +#endif } template diff --git a/onnxruntime/contrib_ops/cuda/bert/decoder_attention.cc b/onnxruntime/contrib_ops/cuda/bert/decoder_attention.cc index 3f703ae3d05e..ceee17c2a2d0 100644 --- a/onnxruntime/contrib_ops/cuda/bert/decoder_attention.cc +++ b/onnxruntime/contrib_ops/cuda/bert/decoder_attention.cc @@ -273,13 +273,13 @@ Status DecoderAttention::ComputeInternal(OpKernelContext* context) const { cublas, CUBLAS_OP_N, CUBLAS_OP_N, n, m, 1, &one, reinterpret_cast(bias->Data()), n, GetConstOnes(m, Stream(context)), 1, - &zero, reinterpret_cast(gemm_query_buffer_p.get()), n, device_prop)); + &zero, reinterpret_cast(gemm_query_buffer_p.get()), n, device_prop, UseTF32())); // matmul: (h2, h1)*(h1, S*B) CUBLAS_RETURN_IF_ERROR(cublasGemmHelper( cublas, CUBLAS_OP_N, CUBLAS_OP_N, n, m, k, &one, reinterpret_cast(q_weights->Data()), n, reinterpret_cast(query->Data()), k, - &one, reinterpret_cast(gemm_query_buffer_p.get()), n, device_prop)); + &one, reinterpret_cast(gemm_query_buffer_p.get()), n, device_prop, UseTF32())); // gemm_query_buffer in col-base: (h2, S*B) // calcualte k, v @@ -298,13 +298,13 @@ Status DecoderAttention::ComputeInternal(OpKernelContext* context) const { cublas, CUBLAS_OP_N, CUBLAS_OP_N, n, m, 1, &one, reinterpret_cast(bias->Data() + hidden_size), n, GetConstOnes(m, Stream(context)), 1, - &zero, reinterpret_cast(gemm_kv_buffer_p.get()), n, device_prop)); + &zero, reinterpret_cast(gemm_kv_buffer_p.get()), n, device_prop, UseTF32())); // matmul: (2*h2, h1)*(h1, T_S*B) CUBLAS_RETURN_IF_ERROR(cublasGemmHelper( cublas, CUBLAS_OP_N, CUBLAS_OP_N, n, m, k, &one, reinterpret_cast(kv_weights->Data()), n, reinterpret_cast(query->Data()), k, - &one, reinterpret_cast(gemm_kv_buffer_p.get()), n, device_prop)); + &one, reinterpret_cast(gemm_kv_buffer_p.get()), n, device_prop, UseTF32())); // gemm_kv_buffer in col-base: (2*h2, T_S*B) } else { gemm_kv_buffer_p = GetScratchBuffer(static_cast(batch_size) * 2 * key_sequence_length * hidden_size, @@ -318,13 +318,13 @@ Status DecoderAttention::ComputeInternal(OpKernelContext* context) const { cublas, CUBLAS_OP_N, CUBLAS_OP_N, n, m, 1, &one, reinterpret_cast(bias->Data() + hidden_size), n, GetConstOnes(m, Stream(context)), 1, - &zero, reinterpret_cast(gemm_kv_buffer_p.get()), n, device_prop)); + &zero, reinterpret_cast(gemm_kv_buffer_p.get()), n, device_prop, UseTF32())); // matmul: (2*h2, h1)*(h1, T_S*B) CUBLAS_RETURN_IF_ERROR(cublasGemmHelper( cublas, CUBLAS_OP_N, CUBLAS_OP_N, n, m, k, &one, reinterpret_cast(kv_weights->Data()), n, reinterpret_cast(key->Data()), k, - &one, reinterpret_cast(gemm_kv_buffer_p.get()), n, device_prop)); + &one, reinterpret_cast(gemm_kv_buffer_p.get()), n, device_prop, UseTF32())); // gemm_kv_buffer in col-base: (2*h2, T_S*B) } } else { @@ -342,13 +342,13 @@ Status DecoderAttention::ComputeInternal(OpKernelContext* context) const { cublas, CUBLAS_OP_N, CUBLAS_OP_N, n, m, 1, &one, reinterpret_cast(bias->Data() + hidden_size), n, GetConstOnes(m, Stream(context)), 1, - &zero, 
reinterpret_cast(gemm_kv_buffer_p.get()), n, device_prop)); + &zero, reinterpret_cast(gemm_kv_buffer_p.get()), n, device_prop, UseTF32())); // matmul: (2*h2, h1)*(h1, T_S*B) CUBLAS_RETURN_IF_ERROR(cublasGemmHelper( cublas, CUBLAS_OP_N, CUBLAS_OP_N, n, m, k, &one, reinterpret_cast(kv_weights->Data()), n, reinterpret_cast(query->Data()), k, - &one, reinterpret_cast(gemm_kv_buffer_p.get()), n, device_prop)); + &one, reinterpret_cast(gemm_kv_buffer_p.get()), n, device_prop, UseTF32())); // gemm_kv_buffer in col-base: (2*h2, T_S*B) } else { kv_sequence_length = cache_sequence_length; @@ -372,6 +372,8 @@ Status DecoderAttention::ComputeInternal(OpKernelContext* context) const { device_prop, #ifdef USE_ROCM GetTuningContext(), +#else + UseTF32(), #endif context->GetComputeStream(), cublas, diff --git a/onnxruntime/contrib_ops/cuda/bert/decoder_attention_impl.cu b/onnxruntime/contrib_ops/cuda/bert/decoder_attention_impl.cu index 1dc22a9c8ea9..c0b199678918 100644 --- a/onnxruntime/contrib_ops/cuda/bert/decoder_attention_impl.cu +++ b/onnxruntime/contrib_ops/cuda/bert/decoder_attention_impl.cu @@ -17,7 +17,7 @@ Status DecoderQkvToContext( const cudaDeviceProp& device_prop, Stream* ort_stream, cublasHandle_t& cublas, - const size_t element_size, + const size_t /*element_size*/, const int batch_size, const int sequence_length, const int kv_sequence_length, @@ -37,7 +37,8 @@ Status DecoderQkvToContext( T* workspace_buffer, T* output, T* new_key_cache, - T* new_value_cache) { + T* new_value_cache, + bool use_tf32) { const int max_threads_per_block = device_prop.maxThreadsPerBlock; const int BN = batch_size * num_heads; const int BHN = BN * head_size; @@ -128,14 +129,14 @@ Status DecoderQkvToContext( kv_sequence_length, sequence_length, head_size, &alpha, key_cache, head_size, strideA, q, head_size, strideB, - &zero, scratch1, kv_sequence_length, temp_matrix_size, BN, device_prop)); + &zero, scratch1, kv_sequence_length, temp_matrix_size, BN, device_prop, use_tf32)); } else { CUBLAS_RETURN_IF_ERROR(cublasGemmStridedBatchedHelper( cublas, CUBLAS_OP_T, CUBLAS_OP_N, kv_sequence_length, sequence_length, head_size, &alpha, k, head_size, strideA, q, head_size, strideB, - &zero, scratch1, kv_sequence_length, temp_matrix_size, BN, device_prop)); + &zero, scratch1, kv_sequence_length, temp_matrix_size, BN, device_prop, use_tf32)); } constexpr bool is_unidirectional = false; @@ -163,14 +164,14 @@ Status DecoderQkvToContext( head_size, sequence_length, kv_sequence_length, &one, value_cache, head_size, strideA, scratch2, kv_sequence_length, temp_matrix_size, - &zero, scratch3, head_size, strideB, BN, device_prop)); + &zero, scratch3, head_size, strideB, BN, device_prop, use_tf32)); } else { CUBLAS_RETURN_IF_ERROR(cublasGemmStridedBatchedHelper( cublas, CUBLAS_OP_N, CUBLAS_OP_N, head_size, sequence_length, kv_sequence_length, &one, v, head_size, strideA, scratch2, kv_sequence_length, temp_matrix_size, - &zero, scratch3, head_size, strideB, BN, device_prop)); + &zero, scratch3, head_size, strideB, BN, device_prop, use_tf32)); } // scratch3 is BxNxSxH, transpose to output SxBxNxH @@ -180,6 +181,7 @@ Status DecoderQkvToContext( Status LaunchDecoderAttentionKernel( const cudaDeviceProp& device_prop, + bool use_tf32, Stream* stream, cublasHandle_t& cublas, const size_t element_size, @@ -228,7 +230,8 @@ Status LaunchDecoderAttentionKernel( reinterpret_cast(workspace_buffer), reinterpret_cast(output), reinterpret_cast(new_key_cache), - reinterpret_cast(new_value_cache)); + reinterpret_cast(new_value_cache), + use_tf32); } 
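From attention.cc down through decoder_attention_impl.cu, these hunks thread a use_tf32 flag (from the kernel's UseTF32()) into every cublasGemmHelper and cublasGemmStridedBatchedHelper call. The sketch below shows the generic cuBLAS mechanism such a flag typically selects for FP32 GEMMs, strict FP32 versus TF32 tensor-core math; it is an assumption about the helper's internals, not ORT's code:

```cpp
#include <cublas_v2.h>

// Plain column-major C = A * B for float, with TF32 opt-in.
cublasStatus_t GemmF32(cublasHandle_t handle, int m, int n, int k,
                       const float* A, const float* B, float* C, bool use_tf32) {
  const float one = 1.0f, zero = 0.0f;
  // TF32 keeps FP32 range but rounds the mantissa to ~10 bits, enabling
  // tensor cores on Ampere+; CUBLAS_COMPUTE_32F forces classic FP32 math.
  const cublasComputeType_t compute =
      use_tf32 ? CUBLAS_COMPUTE_32F_FAST_TF32 : CUBLAS_COMPUTE_32F;
  return cublasGemmEx(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k,
                      &one, A, CUDA_R_32F, m, B, CUDA_R_32F, k,
                      &zero, C, CUDA_R_32F, m,
                      compute, CUBLAS_GEMM_DEFAULT);
}
```

Exposing the toggle per kernel lets accuracy-sensitive ops opt out while the rest of the model keeps the tensor-core speedup.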
else { return DecoderQkvToContext( device_prop, @@ -254,7 +257,8 @@ Status LaunchDecoderAttentionKernel( reinterpret_cast(workspace_buffer), reinterpret_cast(output), reinterpret_cast(new_key_cache), - reinterpret_cast(new_value_cache)); + reinterpret_cast(new_value_cache), + use_tf32); } } diff --git a/onnxruntime/contrib_ops/cuda/bert/decoder_attention_impl.h b/onnxruntime/contrib_ops/cuda/bert/decoder_attention_impl.h index 9db9ccb45e33..f9667a613e64 100644 --- a/onnxruntime/contrib_ops/cuda/bert/decoder_attention_impl.h +++ b/onnxruntime/contrib_ops/cuda/bert/decoder_attention_impl.h @@ -11,6 +11,7 @@ namespace cuda { Status LaunchDecoderAttentionKernel( const cudaDeviceProp& prop, // Device Properties + bool use_tf32, // Use TF32 Stream* stream, // ORT Stream cublasHandle_t& cublas, // Cublas handle const size_t element_size, // Element size of input tensor diff --git a/onnxruntime/contrib_ops/cuda/bert/decoder_masked_multihead_attention.cc b/onnxruntime/contrib_ops/cuda/bert/decoder_masked_multihead_attention.cc index 54aad9cbaf38..66c0aceaed1e 100644 --- a/onnxruntime/contrib_ops/cuda/bert/decoder_masked_multihead_attention.cc +++ b/onnxruntime/contrib_ops/cuda/bert/decoder_masked_multihead_attention.cc @@ -70,6 +70,11 @@ Status DecoderMaskedMultiHeadAttention::ComputeInternal(OpKernelContext* auto& device_prop = GetDeviceProp(); DecoderMaskedMultiHeadAttentionParams parameters; + + parameters.kv_data_in_flight = ParseEnvironmentVariableWithDefault( + attention::kDecoderMaskedAttentionLoadKVDataInFlight, false); + + bool is_unidirectional = false; bool is_dmmha_packing = (key == nullptr && value == nullptr); ORT_RETURN_IF_ERROR(multihead_attention_helper::CheckInputs(query, key, @@ -84,6 +89,7 @@ Status DecoderMaskedMultiHeadAttention::ComputeInternal(OpKernelContext* num_heads_, mask_filter_value_, scale_, + is_unidirectional, past_present_share_buffer_, is_dmmha_packing, // dmmha_packing device_prop.maxThreadsPerBlock)); diff --git a/onnxruntime/contrib_ops/cuda/bert/decoder_masked_self_attention.cc b/onnxruntime/contrib_ops/cuda/bert/decoder_masked_self_attention.cc index 69ed07101e64..07a6fbd60e17 100644 --- a/onnxruntime/contrib_ops/cuda/bert/decoder_masked_self_attention.cc +++ b/onnxruntime/contrib_ops/cuda/bert/decoder_masked_self_attention.cc @@ -52,6 +52,10 @@ Status DecoderMaskedSelfAttention::ComputeInternal(OpKernelContext* cont auto& device_prop = GetDeviceProp(); DecoderMaskedMultiHeadAttentionParams parameters; + + parameters.kv_data_in_flight = ParseEnvironmentVariableWithDefault( + attention::kDecoderMaskedAttentionLoadKVDataInFlight, false); + ORT_RETURN_IF_ERROR(CheckInputs(input->Shape(), weights->Shape(), bias->Shape(), @@ -139,7 +143,7 @@ Status DecoderMaskedSelfAttention::ComputeInternal(OpKernelContext* cont cublas, CUBLAS_OP_N, CUBLAS_OP_N, n, m, k, &one, reinterpret_cast(weights->Data()), n, reinterpret_cast(input->Data()), k, - &zero, reinterpret_cast(gemm_buffer.get()), n, device_prop)); + &zero, reinterpret_cast(gemm_buffer.get()), n, device_prop, UseTF32())); // Update the q, k, and v buffers parameters.q = gemm_buffer.get(); diff --git a/onnxruntime/contrib_ops/cuda/bert/fast_gelu.cc b/onnxruntime/contrib_ops/cuda/bert/fast_gelu.cc index 892f5c181a60..8b8e4e267f89 100644 --- a/onnxruntime/contrib_ops/cuda/bert/fast_gelu.cc +++ b/onnxruntime/contrib_ops/cuda/bert/fast_gelu.cc @@ -4,9 +4,13 @@ #include "core/providers/cuda/cuda_common.h" #include "core/providers/cuda/cudnn_common.h" #include "fast_gelu.h" -#include "fast_gelu_impl.h" +#include 
"core/providers/cuda/tensor/gelu_impl.h" #include "contrib_ops/cpu/bert/bias_gelu_helper.h" -#include "transformer_common.h" +#ifdef USE_ROCM +#include "contrib_ops/rocm/bert/elementwise.h" +#else +#include "contrib_ops/cuda/bert/transformer_common.h" +#endif namespace onnxruntime { namespace contrib { @@ -31,8 +35,10 @@ using namespace ONNX_NAMESPACE; template FastGelu::FastGelu(const OpKernelInfo& op_kernel_info) : CudaKernel(op_kernel_info) { +#ifndef USE_ROCM const TransformerOptions* options = TransformerOptions::GetInstance(); use_half2_ = !options->DisableHalf2(); +#endif } template @@ -50,6 +56,13 @@ Status FastGelu::ComputeInternal(OpKernelContext* context) const { int64_t bias_length = (nullptr == bias) ? 0 : bias->Shape().Size(); typedef typename ToCudaType::MappedType CudaT; +#ifdef USE_ROCM + return LaunchElementwiseKernel( + GetTuningContext(), context->GetComputeStream(), + reinterpret_cast(input->Data()), static_cast(input_length), + (nullptr != bias) ? reinterpret_cast(bias->Data()) : nullptr, static_cast(bias_length), + reinterpret_cast(output->MutableData())); +#else return LaunchFastGeluKernel(GetDeviceProp(), Stream(context), static_cast(input_length), @@ -58,6 +71,7 @@ Status FastGelu::ComputeInternal(OpKernelContext* context) const { (nullptr != bias) ? reinterpret_cast(bias->Data()) : nullptr, reinterpret_cast(output->MutableData()), use_half2_); +#endif } } // namespace cuda diff --git a/onnxruntime/contrib_ops/cuda/bert/fast_gelu.h b/onnxruntime/contrib_ops/cuda/bert/fast_gelu.h index 3e642a70afef..26f3bd5a0392 100644 --- a/onnxruntime/contrib_ops/cuda/bert/fast_gelu.h +++ b/onnxruntime/contrib_ops/cuda/bert/fast_gelu.h @@ -18,7 +18,9 @@ class FastGelu final : public CudaKernel { Status ComputeInternal(OpKernelContext* ctx) const override; private: +#ifndef USE_ROCM bool use_half2_; +#endif }; } // namespace cuda diff --git a/onnxruntime/contrib_ops/cuda/bert/fastertransformer_decoder_attention/decoder_masked_multihead_attention_impl.cu b/onnxruntime/contrib_ops/cuda/bert/fastertransformer_decoder_attention/decoder_masked_multihead_attention_impl.cu index 33e7a3349477..9efb6f08e8e9 100644 --- a/onnxruntime/contrib_ops/cuda/bert/fastertransformer_decoder_attention/decoder_masked_multihead_attention_impl.cu +++ b/onnxruntime/contrib_ops/cuda/bert/fastertransformer_decoder_attention/decoder_masked_multihead_attention_impl.cu @@ -344,52 +344,148 @@ __global__ void masked_multihead_attention_kernel(DecoderMaskedMultiHeadAttentio bool has_beams = params.cache_indir != nullptr && !params.is_cross_attention; const int* beam_indices = has_beams ? ¶ms.cache_indir[bi_max_seq_length] : nullptr; - for (int ti = ko; ti < ti_end; ti += K_PER_ITER) { - bool is_masked = (params.mask != nullptr) && (params.mask[bi_total_seq_length + ti] == 0); + if (!params.kv_data_in_flight) { + for (int ti = ko; ti < ti_end; ti += K_PER_ITER) { + bool is_masked = (params.mask != nullptr) && (params.mask[bi_total_seq_length + ti] == 0); - // The keys loaded from the key cache. - K_vec_k k_vec[K_VECS_PER_THREAD]; - if (ti < tlength) { - if (has_beams) { - const int beam_offset = beam_indices[ti] * params.num_heads * params.max_sequence_length * head_size; + // The keys loaded from the key cache. 
+ K_vec_k k_vec[K_VECS_PER_THREAD]; + if (ti < tlength) { + if (has_beams) { + const int beam_offset = beam_indices[ti] * params.num_heads * params.max_sequence_length * head_size; #pragma unroll - for (int ii = 0; ii < K_VECS_PER_THREAD; ++ii) { - int jj = ii * params.max_sequence_length + ti; + for (int ii = 0; ii < K_VECS_PER_THREAD; ++ii) { + int jj = ii * params.max_sequence_length + ti; - k_vec[ii] = vec_conversion( - (*reinterpret_cast(&k_cache_batch[beam_offset + jj * QK_ELTS_IN_16B]))); - } - } else { + k_vec[ii] = vec_conversion( + (*reinterpret_cast(&k_cache_batch[beam_offset + jj * QK_ELTS_IN_16B]))); + } + } else { #pragma unroll - for (int ii = 0; ii < K_VECS_PER_THREAD; ++ii) { - int jj = ii * params.max_sequence_length + ti; + for (int ii = 0; ii < K_VECS_PER_THREAD; ++ii) { + int jj = ii * params.max_sequence_length + ti; - k_vec[ii] = vec_conversion( - (*reinterpret_cast(&k_cache_batch[jj * QK_ELTS_IN_16B]))); + k_vec[ii] = vec_conversion( + (*reinterpret_cast(&k_cache_batch[jj * QK_ELTS_IN_16B]))); + } } } - } - // Perform the dot product and normalize qk. - // WARNING: ALL THE THREADS OF A WARP MUST ENTER!!! - float qk = Qk_dot::dot(q_vec, k_vec) * inv_sqrt_dh; + // Perform the dot product and normalize qk. + // WARNING: ALL THE THREADS OF A WARP MUST ENTER!!! + float qk = Qk_dot::dot(q_vec, k_vec) * inv_sqrt_dh; - // This is a deviation from FasterTransformer kernel implementation - // but this aligns with ORT's other Attention kernels which strives to - // mimic PyTorch when dealing with mask filter values - if (is_masked) { - qk += params.mask_filter_value; + // This is a deviation from FasterTransformer kernel implementation + // but this aligns with ORT's other Attention kernels which strives to + // mimic PyTorch when dealing with mask filter values + if (is_masked) { + qk += params.mask_filter_value; + } + + // Store the product to shared memory. There's one qk value per timestep. Update the max. + if (ti < tlength && tidx % THREADS_PER_KEY == 0) { + if (params.relative_attention_bias != nullptr) { + qk = add_vec(qk, + reinterpret_cast(params.relative_attention_bias)[hi * params.sequence_length * params.total_sequence_length + ti]); + } + qk_max = fmaxf(qk_max, qk); + qk_smem[ti] = qk; + } } + } else { + // TODO(hasesh): Tune this value for different workloads. Currently, it is tuned for Whisper model + // Also tune it for different architectures. This works best for Whisper on 80GB A100. + constexpr int K_CACHE_DATA_LOAD_UNROLL = 4; - // Store the product to shared memory. There's one qk value per timestep. Update the max. 
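Both halves of this kernel branch on params.kv_data_in_flight, which the operator .cc files populate from an environment variable (attention::kDecoderMaskedAttentionLoadKVDataInFlight) through ParseEnvironmentVariableWithDefault. A minimal stand-in for that kind of boolean toggle, assuming only std::getenv (the helper name below is illustrative, not ORT's actual utility):

```cpp
#include <cstdlib>
#include <string>

// Illustrative only: reads a "0"/"1"-style boolean from the environment,
// falling back to a default when the variable is unset.
bool ParseBoolEnvOrDefault(const char* name, bool default_value) {
  const char* raw = std::getenv(name);
  if (raw == nullptr) return default_value;
  const std::string value(raw);
  return value == "1" || value == "true" || value == "True";
}
```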
- if (ti < tlength && tidx % THREADS_PER_KEY == 0) { - if (params.relative_attention_bias != nullptr) { - qk = add_vec(qk, - reinterpret_cast(params.relative_attention_bias)[hi * params.sequence_length * params.total_sequence_length + ti]); + for (int ti = ko; ti < ti_end; ti += (K_CACHE_DATA_LOAD_UNROLL * K_PER_ITER)) { + int is_masked[K_CACHE_DATA_LOAD_UNROLL]; + int beam_offset[K_CACHE_DATA_LOAD_UNROLL]; + int time_step[K_CACHE_DATA_LOAD_UNROLL]; + bool time_bounds_cond[K_CACHE_DATA_LOAD_UNROLL]; + +#pragma unroll + for (int k_unroll = 0; k_unroll < K_CACHE_DATA_LOAD_UNROLL; ++k_unroll) { + is_masked[k_unroll] = 1; + beam_offset[k_unroll] = 0; + time_step[k_unroll] = ti + k_unroll * K_PER_ITER; + time_bounds_cond[k_unroll] = (time_step[k_unroll] < tlength); + } + +#pragma unroll + for (int k_unroll = 0; k_unroll < K_CACHE_DATA_LOAD_UNROLL; ++k_unroll) { + if (time_bounds_cond[k_unroll] && params.mask != nullptr) { + is_masked[k_unroll] = params.mask[bi_total_seq_length + time_step[k_unroll]]; + } + } + + if (has_beams) { + int head_maxlength_headsize_prod = params.num_heads * params.max_sequence_length * head_size; + +#pragma unroll + for (int k_unroll = 0; k_unroll < K_CACHE_DATA_LOAD_UNROLL; ++k_unroll) { + if (time_bounds_cond[k_unroll]) { + beam_offset[k_unroll] = beam_indices[time_step[k_unroll]] * head_maxlength_headsize_prod; + } + } + } + + // The keys loaded from the key cache. + K_vec_k k_vec[K_CACHE_DATA_LOAD_UNROLL][K_VECS_PER_THREAD]; + +#pragma unroll + for (int k_unroll = 0; k_unroll < K_CACHE_DATA_LOAD_UNROLL; ++k_unroll) { + if (time_bounds_cond[k_unroll]) { + if (has_beams) { +#pragma unroll + for (int ii = 0; ii < K_VECS_PER_THREAD; ++ii) { + int jj = ii * params.max_sequence_length + time_step[k_unroll]; + + k_vec[k_unroll][ii] = vec_conversion( + (*reinterpret_cast(&k_cache_batch[beam_offset[k_unroll] + jj * QK_ELTS_IN_16B]))); + } + } else { +#pragma unroll + for (int ii = 0; ii < K_VECS_PER_THREAD; ++ii) { + int jj = ii * params.max_sequence_length + time_step[k_unroll]; + + k_vec[k_unroll][ii] = vec_conversion( + (*reinterpret_cast(&k_cache_batch[jj * QK_ELTS_IN_16B]))); + } + } + } + } + + // Perform the dot product and normalize qk. + // WARNING: ALL THE THREADS OF A WARP MUST ENTER!!! + float qk[K_CACHE_DATA_LOAD_UNROLL]; +#pragma unroll + for (int k_unroll = 0; k_unroll < K_CACHE_DATA_LOAD_UNROLL; ++k_unroll) { + qk[k_unroll] = Qk_dot::dot(q_vec, k_vec[k_unroll]) * inv_sqrt_dh; + } + +// This is a deviation from FasterTransformer kernel implementation +// but this aligns with ORT's other Attention kernels which strives to +// mimic PyTorch when dealing with mask filter values +#pragma unroll + for (int k_unroll = 0; k_unroll < K_CACHE_DATA_LOAD_UNROLL; ++k_unroll) { + if (time_bounds_cond[k_unroll] && is_masked[k_unroll] == 0) { + qk[k_unroll] += params.mask_filter_value; + } + } + +// Store the product to shared memory. There's one qk value per timestep. Update the max. 
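The unrolled path above batches K_CACHE_DATA_LOAD_UNROLL timesteps per iteration and keeps the bounds setup, the global-memory loads, and the dot products in separate unrolled loops, so several key-cache reads can be in flight before any of them is consumed. The pattern, distilled (illustrative names, not the kernel itself):

```cpp
// Load/compute split (CUDA device code): issue UNROLL independent global
// loads first, then consume them, instead of serializing load -> compute
// per timestep.
template <int UNROLL>
__device__ float SumInFlight(const float* cache, int start, int stride, int length) {
  float vals[UNROLL];
  bool in_bounds[UNROLL];

#pragma unroll
  for (int u = 0; u < UNROLL; ++u) {  // address setup + loads only
    const int t = start + u * stride;
    in_bounds[u] = (t < length);
    vals[u] = in_bounds[u] ? cache[t] : 0.0f;
  }

  float acc = 0.0f;
#pragma unroll
  for (int u = 0; u < UNROLL; ++u) {  // compute only
    if (in_bounds[u]) acc += vals[u];
  }
  return acc;
}
```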
+#pragma unroll + for (int k_unroll = 0; k_unroll < K_CACHE_DATA_LOAD_UNROLL; ++k_unroll) { + if (time_bounds_cond[k_unroll] && (tidx % THREADS_PER_KEY == 0)) { + if (params.relative_attention_bias != nullptr) { + qk[k_unroll] = add_vec(qk[k_unroll], + reinterpret_cast(params.relative_attention_bias)[hi * params.sequence_length * params.total_sequence_length + time_step[k_unroll]]); + } + qk_max = fmaxf(qk_max, qk[k_unroll]); + qk_smem[time_step[k_unroll]] = qk[k_unroll]; + } } - qk_max = fmaxf(qk_max, qk); - qk_smem[ti] = qk; } } @@ -504,18 +600,80 @@ __global__ void masked_multihead_attention_kernel(DecoderMaskedMultiHeadAttentio V_vec_acum out; zero(out); - // Loop over the timesteps to compute the partial outputs. - for (int ti = vo; ti < tlength; ti += V_PER_ITER) { - // Fetch offset based on cache_indir when beam sampling - const int beam_src = has_beams ? params.cache_indir[bi_max_seq_length + ti] : 0; - const int beam_offset = has_beams ? beam_src * params.num_heads * params.max_sequence_length * head_size : 0; + if (!params.kv_data_in_flight) { + // Loop over the timesteps to compute the partial outputs. + for (int ti = vo; ti < tlength; ti += V_PER_ITER) { + // Fetch offset based on cache_indir when beam sampling + const int beam_src = has_beams ? params.cache_indir[bi_max_seq_length + ti] : 0; + const int beam_offset = has_beams ? beam_src * params.num_heads * params.max_sequence_length * head_size : 0; + + // Load the values from the cache. + V_vec_k v = vec_conversion(*reinterpret_cast(&v_cache_batch[beam_offset + ti * head_size])); + + // Load the logits from shared memory. + T logit = logits_smem[ti]; + out = fma(logit, v, out); + } + } else { + // Loop over the timesteps to compute the partial outputs. + + // TODO(hasesh): Tune this value for different workloads. Currently, it is tuned for Whisper model + // Also tune it for different architectures. This works best for Whisper on 80GB A100. + constexpr int V_CACHE_DATA_LOAD_UNROLL = 8; + + for (int ti = vo; ti < tlength; ti += V_CACHE_DATA_LOAD_UNROLL * V_PER_ITER) { + int beam_src[V_CACHE_DATA_LOAD_UNROLL]; + int beam_offset[V_CACHE_DATA_LOAD_UNROLL]; + int time_step[V_CACHE_DATA_LOAD_UNROLL]; + bool time_bounds_cond[V_CACHE_DATA_LOAD_UNROLL]; + +#pragma unroll + for (int v_unroll = 0; v_unroll < V_CACHE_DATA_LOAD_UNROLL; ++v_unroll) { + beam_src[v_unroll] = 0; + beam_offset[v_unroll] = 0; + time_step[v_unroll] = ti + v_unroll * V_PER_ITER; + time_bounds_cond[v_unroll] = (time_step[v_unroll] < tlength); + } + + int head_maxlength_headsize_prod = params.num_heads * params.max_sequence_length * head_size; + + if (has_beams) { +// Do the global memory read and corresponding compute in separate unrolled loops +#pragma unroll + for (int v_unroll = 0; v_unroll < V_CACHE_DATA_LOAD_UNROLL; ++v_unroll) { + if (time_bounds_cond[v_unroll]) { + beam_src[v_unroll] = params.cache_indir[bi_max_seq_length + time_step[v_unroll]]; + } + } + +#pragma unroll + for (int v_unroll = 0; v_unroll < V_CACHE_DATA_LOAD_UNROLL; ++v_unroll) { + if (time_bounds_cond[v_unroll]) { + beam_offset[v_unroll] = beam_src[v_unroll] * head_maxlength_headsize_prod; + } + } + } - // Load the values from the cache. - V_vec_k v = vec_conversion(*reinterpret_cast(&v_cache_batch[beam_offset + ti * head_size])); + // Load the values from the V-cache and logits from shared memory. + V_vec_k v[V_CACHE_DATA_LOAD_UNROLL]; + T logits[V_CACHE_DATA_LOAD_UNROLL]; - // Load the logits from shared memory. 
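For orientation, the V loop being reworked below computes the standard attention aggregation out = sum over t of softmax(qk)_t * v_t, with fma keeping each multiply-add fused. In scalar form (a sketch, not the vectorized kernel code):

```cpp
// Scalar form of the per-timestep value aggregation in the V loop; logits
// here already hold the normalized softmax weights from shared memory.
#include <cmath>

float AggregateValues(const float* logits, const float* v, int tlength) {
  float out = 0.0f;
  for (int ti = 0; ti < tlength; ++ti) {
    out = std::fma(logits[ti], v[ti], out);  // out += weight * value
  }
  return out;
}
```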
- T logit = logits_smem[ti]; - out = fma(logit, v, out); +// Do the global memory read and compute in separate unrolled loops +#pragma unroll + for (int v_unroll = 0; v_unroll < V_CACHE_DATA_LOAD_UNROLL; ++v_unroll) { + if (time_bounds_cond[v_unroll]) { + v[v_unroll] = vec_conversion(*reinterpret_cast(&v_cache_batch[beam_offset[v_unroll] + time_step[v_unroll] * head_size])); + logits[v_unroll] = logits_smem[time_step[v_unroll]]; + } + } + +#pragma unroll + for (int v_unroll = 0; v_unroll < V_CACHE_DATA_LOAD_UNROLL; ++v_unroll) { + if (time_bounds_cond[v_unroll]) { + out = fma(logits[v_unroll], v[v_unroll], out); + } + } + } } // One group of threads computes the product(s) for the current timestep. diff --git a/onnxruntime/contrib_ops/cuda/bert/fastertransformer_decoder_attention/decoder_masked_multihead_attention_impl.h b/onnxruntime/contrib_ops/cuda/bert/fastertransformer_decoder_attention/decoder_masked_multihead_attention_impl.h index 4b408dafa2d8..1a17757d1ec2 100644 --- a/onnxruntime/contrib_ops/cuda/bert/fastertransformer_decoder_attention/decoder_masked_multihead_attention_impl.h +++ b/onnxruntime/contrib_ops/cuda/bert/fastertransformer_decoder_attention/decoder_masked_multihead_attention_impl.h @@ -22,6 +22,12 @@ struct DecoderMaskedMultiHeadAttentionParams : AttentionParameters { bool is_cross_attention = false; bool is_packed_qkv = false; + // Useful to better use global memory bandwidth on certain CUDA architectures. + // Turned off by default for now until we fully understand performance implications + // for all types of workloads. + // Can be turned on by appropriate environment variable (see attention_common.h). + bool kv_data_in_flight = false; + void* q = nullptr; void* q_bias = nullptr; @@ -62,4 +68,4 @@ void mmha_launch_kernel(const DecoderMaskedMultiHeadAttentionParams& params, cud } // namespace cuda } // namespace contrib -} // namespace onnxruntime +} // namespace onnxruntime \ No newline at end of file diff --git a/onnxruntime/contrib_ops/cuda/bert/flash_attention/flash_api.cc b/onnxruntime/contrib_ops/cuda/bert/flash_attention/flash_api.cc index 76190aad68fd..0f58a74c4d2f 100644 --- a/onnxruntime/contrib_ops/cuda/bert/flash_attention/flash_api.cc +++ b/onnxruntime/contrib_ops/cuda/bert/flash_attention/flash_api.cc @@ -35,6 +35,7 @@ void set_params_fprop(Flash_fwd_params& params, void* softmax_lse_d, float softmax_scale, bool is_causal, + bool is_bf16, bool kv_bsnh = true, int window_size_left = -1, int window_size_right = -1) { @@ -44,7 +45,7 @@ void set_params_fprop(Flash_fwd_params& params, params.v_ptr = v; params.o_ptr = out; - params.is_bf16 = false; + params.is_bf16 = is_bf16; // All stride are in elements, not bytes. if (kv_bsnh) { @@ -240,6 +241,7 @@ Status mha_fwd(const cudaDeviceProp& dprops, int seqlen_k, float softmax_scale, bool is_causal, + bool is_bf16, int num_splits, void* softmax_lse_accum, // num_splits x batch_size x seqlen_q x num_heads void* out_accum, // num_splits x batch_size x seqlen_q x num_heads x head_size_rounded @@ -264,6 +266,7 @@ Status mha_fwd(const cudaDeviceProp& dprops, softmax_lse, softmax_scale, is_causal, + is_bf16, kv_bsnh, local_window_size, is_causal ? 
0 : -1); @@ -306,7 +309,8 @@ Status mha_varlen_fwd(const cudaDeviceProp& dprops, int max_seqlen_q, int max_seqlen_k, float softmax_scale, - bool is_causal) { + bool is_causal, + bool is_bf16) { auto round_multiple = [](int x, int m) { return (x + m - 1) / m * m; }; const int head_size_rounded = round_multiple(head_size, 32); const int seqlen_q_rounded = round_multiple(max_seqlen_q, 128); @@ -326,6 +330,7 @@ Status mha_varlen_fwd(const cudaDeviceProp& dprops, softmax_lse, softmax_scale, is_causal, + is_bf16, true, -1, is_causal ? 0 : -1); @@ -350,13 +355,15 @@ bool is_supported(const cudaDeviceProp& dprops, int head_size, int num_heads, in Status mha_fwd_kvcache(const cudaDeviceProp& dprops, cudaStream_t stream, void* q, // batch_size x seqlen_q x num_heads x head_size - void* kcache, // batch_size x seqlen_k x num_heads_k x head_size or batch_size x num_heads_k seqlen_k x head_size - void* vcache, // batch_size x seqlen_k x num_heads_k x head_size or batch_size x num_heads_k seqlen_k x head_size - void* k, // (optional) batch_size x seqlen_k_new x num_heads_k x head_size - void* v, // (optional) batch_size x seqlen_k_new x num_heads_k x head_size + void* kcache, // batch_size x seqlen_k_max x num_heads_k x head_size or batch_size x num_heads_k x seqlen_k_max x head_size + void* vcache, // batch_size x seqlen_k_max x num_heads_k x head_size or batch_size x num_heads_k x seqlen_k_max x head_size + void* k_new, // (optional) batch_size x seqlen_k_new x num_heads_k x head_size + void* v_new, // (optional) batch_size x seqlen_k_new x num_heads_k x head_size void* out, // batch_size x seqlen_q x num_heads x head_size void* softmax_lse, // batch_size x num_heads x seqlen_q void* seqlens_k_, // batch_size + void* rotary_cos, // seqlen_ro x (rotary_dim / 2) + void* rotary_sin, // seqlen_ro x (rotary_dim / 2) int batch_size, int num_heads, int num_heads_k, @@ -364,22 +371,23 @@ Status mha_fwd_kvcache(const cudaDeviceProp& dprops, int seqlen_q, int seqlen_k, int seqlen_k_new, + int rotary_dim, const float softmax_scale, bool is_causal, + bool is_bf16, bool past_bsnh, // otherwise bnsh int num_splits, void* softmax_lse_accum, // num_splits x batch_size x seqlen_q x num_heads void* out_accum, // num_splits x batch_size x seqlen_q x num_heads x head_size_rounded - int local_window_size) { - // if (seqlen_q == 1) { - // is_causal = false; - // } // causal=true is the same as causal=false in this case - + int local_window_size, + bool is_rotary_interleaved, + bool is_packed_qkv) { auto round_multiple = [](int x, int m) { return (x + m - 1) / m * m; }; const int head_size_rounded = round_multiple(head_size, 32); const int seqlen_q_rounded = round_multiple(seqlen_q, 128); const int seqlen_k_rounded = round_multiple(seqlen_k, 128); + // In the kv-cache case, seqlen_k_max is used as the kv sequence length Flash_fwd_params params; set_params_fprop(params, batch_size, @@ -394,20 +402,30 @@ Status mha_fwd_kvcache(const cudaDeviceProp& dprops, softmax_lse, softmax_scale, is_causal, + is_bf16, past_bsnh, local_window_size, is_causal ? 0 : -1); params.dprops = &dprops; - if (k != nullptr && v != nullptr) { + if (k_new != nullptr && v_new != nullptr) { params.seqlen_knew = seqlen_k_new; - params.knew_ptr = k; - params.vnew_ptr = v; + params.knew_ptr = k_new; + params.vnew_ptr = v_new; // All stride are in elements, not bytes. 
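mha_fwd_kvcache now takes cos/sin tables plus rotary_dim and is_rotary_interleaved so the kernel can rotate Q and the appended K in place. The underlying rotation, in scalar form (a sketch; ApplyRotaryRef is an illustrative name, and cos_vals/sin_vals hold the rotary_dim / 2 table entries for one position):

```cpp
// Rotary embedding applied to one row of Q or K. Interleaved mode pairs
// adjacent elements (2i, 2i+1); non-interleaved pairs (i, i + rotary_dim/2).
void ApplyRotaryRef(float* x, const float* cos_vals, const float* sin_vals,
                    int rotary_dim, bool interleaved) {
  const int half = rotary_dim / 2;
  for (int i = 0; i < half; ++i) {
    const int i0 = interleaved ? 2 * i : i;
    const int i1 = interleaved ? 2 * i + 1 : i + half;
    const float a = x[i0];
    const float b = x[i1];
    x[i0] = a * cos_vals[i] - b * sin_vals[i];
    x[i1] = a * sin_vals[i] + b * cos_vals[i];
  }
}
```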
- params.knew_batch_stride = seqlen_k_new * num_heads_k * head_size; - params.vnew_batch_stride = seqlen_k_new * num_heads_k * head_size; - params.knew_row_stride = num_heads_k * head_size; - params.vnew_row_stride = num_heads_k * head_size; + if (is_packed_qkv) { + params.q_batch_stride = (seqlen_q * num_heads * head_size) + (2 * seqlen_k_new * num_heads_k * head_size); + params.q_row_stride = (num_heads * head_size) + (2 * num_heads_k * head_size); + params.knew_batch_stride = (seqlen_q * num_heads * head_size) + (2 * seqlen_k_new * num_heads_k * head_size); + params.vnew_batch_stride = (seqlen_q * num_heads * head_size) + (2 * seqlen_k_new * num_heads_k * head_size); + params.knew_row_stride = (num_heads * head_size) + (2 * num_heads_k * head_size); + params.vnew_row_stride = (num_heads * head_size) + (2 * num_heads_k * head_size); + } else { + params.knew_batch_stride = seqlen_k_new * num_heads_k * head_size; + params.vnew_batch_stride = seqlen_k_new * num_heads_k * head_size; + params.knew_row_stride = num_heads_k * head_size; + params.vnew_row_stride = num_heads_k * head_size; + } params.knew_head_stride = head_size; params.vnew_head_stride = head_size; } else { @@ -427,6 +445,13 @@ Status mha_fwd_kvcache(const cudaDeviceProp& dprops, params.cu_seqlens_k = static_cast(seqlens_k_); } + if (rotary_cos != nullptr) { + params.rotary_cos_ptr = rotary_cos; + params.rotary_sin_ptr = rotary_sin; + params.is_rotary_interleaved = is_rotary_interleaved; + params.rotary_dim = rotary_dim; + } + params.num_splits = num_splits; if (params.num_splits > 1 && softmax_lse_accum != nullptr && out_accum != nullptr) { params.softmax_lseaccum_ptr = softmax_lse_accum; @@ -437,7 +462,7 @@ Status mha_fwd_kvcache(const cudaDeviceProp& dprops, } // Only split kernel supports appending to KV cache - run_mha_fwd(params, stream, /*force_split_kernel=*/k != nullptr); + run_mha_fwd(params, stream, /*force_split_kernel=*/k_new != nullptr); return Status::OK(); } diff --git a/onnxruntime/contrib_ops/cuda/bert/flash_attention/flash_api.h b/onnxruntime/contrib_ops/cuda/bert/flash_attention/flash_api.h index efc1f565c4fa..24891bcc4d49 100644 --- a/onnxruntime/contrib_ops/cuda/bert/flash_attention/flash_api.h +++ b/onnxruntime/contrib_ops/cuda/bert/flash_attention/flash_api.h @@ -51,6 +51,7 @@ Status mha_fwd(const cudaDeviceProp& dprops, int seqlen_k, float softmax_scale, bool is_causal, + bool is_bf16, int num_splits = 0, void* softmax_lse_accum = nullptr, // num_splits x batch_size x seqlen_q x num_heads void* out_accum = nullptr, // num_splits x batch_size x seqlen_q x num_heads x head_size_rounded @@ -73,7 +74,8 @@ Status mha_varlen_fwd(const cudaDeviceProp& dprops, int max_seqlen_q, int max_seqlen_k, float softmax_scale, - bool is_causal); + bool is_causal, + bool is_bf16); Status mha_fwd_kvcache(const cudaDeviceProp& dprops, cudaStream_t stream, @@ -85,6 +87,8 @@ Status mha_fwd_kvcache(const cudaDeviceProp& dprops, void* out, // batch_size x seqlen_q x num_heads x head_size void* softmax_lse, // batch_size x num_heads x seqlen_q void* seqlens_k_, // batch_size + void* rotary_cos, // seqlen_ro x (rotary_dim / 2) + void* rotary_sin, // seqlen_ro x (rotary_dim / 2) int batch_size, int num_heads, int num_heads_k, @@ -92,13 +96,17 @@ Status mha_fwd_kvcache(const cudaDeviceProp& dprops, int seqlen_q, int seqlen_k, int seqlen_k_new, + int rotary_dim, const float softmax_scale, bool is_causal, + bool is_bf16, bool past_bsnh, // otherwise bnsh int num_splits = 0, void* softmax_lse_accum = nullptr, // num_splits x batch_size 
x seqlen_q x num_heads void* out_accum = nullptr, // num_splits x batch_size x seqlen_q x num_heads x head_size_rounded - int local_window_size = -1); + int local_window_size = -1, + bool is_rotary_interleaved = false, + bool is_packed_qkv = false); size_t get_softmax_lse_size(int max_seqlen_q, int batch_size, int num_heads); diff --git a/onnxruntime/contrib_ops/cuda/bert/flash_attention/flash_fwd_hdim128_bf16_sm80.cu b/onnxruntime/contrib_ops/cuda/bert/flash_attention/flash_fwd_hdim128_bf16_sm80.cu new file mode 100644 index 000000000000..431eb2bd69de --- /dev/null +++ b/onnxruntime/contrib_ops/cuda/bert/flash_attention/flash_fwd_hdim128_bf16_sm80.cu @@ -0,0 +1,18 @@ +// Copyright (c) 2023, Tri Dao. + +// Splitting the different head dimensions to different files to speed up compilation. +#if USE_FLASH_ATTENTION + +#include "contrib_ops/cuda/bert/flash_attention/flash_fwd_launch_template.h" + +namespace onnxruntime { +namespace flash { + +template<> +void run_mha_fwd_(Flash_fwd_params& params, cudaStream_t stream) { + run_mha_fwd_hdim128(params, stream); +} + +} // namespace flash +} // namespace onnxruntime +#endif diff --git a/onnxruntime/contrib_ops/cuda/bert/flash_attention/flash_fwd_hdim160_bf16_sm80.cu b/onnxruntime/contrib_ops/cuda/bert/flash_attention/flash_fwd_hdim160_bf16_sm80.cu new file mode 100644 index 000000000000..0cb48272dec3 --- /dev/null +++ b/onnxruntime/contrib_ops/cuda/bert/flash_attention/flash_fwd_hdim160_bf16_sm80.cu @@ -0,0 +1,18 @@ +// Copyright (c) 2023, Tri Dao. + +// Splitting the different head dimensions to different files to speed up compilation. +#if USE_FLASH_ATTENTION + +#include "contrib_ops/cuda/bert/flash_attention/flash_fwd_launch_template.h" + +namespace onnxruntime { +namespace flash { + +template<> +void run_mha_fwd_(Flash_fwd_params& params, cudaStream_t stream) { + run_mha_fwd_hdim160(params, stream); +} + +} // namespace flash +} // namespace onnxruntime +#endif diff --git a/onnxruntime/contrib_ops/cuda/bert/flash_attention/flash_fwd_hdim192_bf16_sm80.cu b/onnxruntime/contrib_ops/cuda/bert/flash_attention/flash_fwd_hdim192_bf16_sm80.cu new file mode 100644 index 000000000000..142e922f7103 --- /dev/null +++ b/onnxruntime/contrib_ops/cuda/bert/flash_attention/flash_fwd_hdim192_bf16_sm80.cu @@ -0,0 +1,18 @@ +// Copyright (c) 2023, Tri Dao. + +// Splitting the different head dimensions to different files to speed up compilation. +#if USE_FLASH_ATTENTION + +#include "contrib_ops/cuda/bert/flash_attention/flash_fwd_launch_template.h" + +namespace onnxruntime { +namespace flash { + +template<> +void run_mha_fwd_(Flash_fwd_params& params, cudaStream_t stream) { + run_mha_fwd_hdim192(params, stream); +} + +} // namespace flash +} // namespace onnxruntime +#endif diff --git a/onnxruntime/contrib_ops/cuda/bert/flash_attention/flash_fwd_hdim224_bf16_sm80.cu b/onnxruntime/contrib_ops/cuda/bert/flash_attention/flash_fwd_hdim224_bf16_sm80.cu new file mode 100644 index 000000000000..2142b1c34311 --- /dev/null +++ b/onnxruntime/contrib_ops/cuda/bert/flash_attention/flash_fwd_hdim224_bf16_sm80.cu @@ -0,0 +1,18 @@ +// Copyright (c) 2023, Tri Dao. + +// Splitting the different head dimensions to different files to speed up compilation. 
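Stepping back to the packed-QKV stride block in flash_api.cc above: when Q, K_new, and V_new share one [batch, seq, (num_heads + 2 * num_heads_k) * head_size] buffer, each view keeps its own head count but must stride over the full packed width between rows and batches. A worked check with sample sizes (numbers are illustrative):

```cpp
#include <cstdio>

int main() {
  const int seqlen_q = 1, seqlen_k_new = 1;  // token-generation step
  const int num_heads = 32, num_heads_k = 8, head_size = 128;

  // One packed row holds [Q | K_new | V_new], so every view strides past it.
  const int row_stride = (num_heads + 2 * num_heads_k) * head_size;
  const int batch_stride = (seqlen_q * num_heads * head_size) +
                           (2 * seqlen_k_new * num_heads_k * head_size);

  std::printf("row stride   = %d elements\n", row_stride);    // 6144
  std::printf("batch stride = %d elements\n", batch_stride);  // 6144 here
  return 0;
}
```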
+#if USE_FLASH_ATTENTION + +#include "contrib_ops/cuda/bert/flash_attention/flash_fwd_launch_template.h" + +namespace onnxruntime { +namespace flash { + +template<> +void run_mha_fwd_(Flash_fwd_params& params, cudaStream_t stream) { + run_mha_fwd_hdim224(params, stream); +} + +} // namespace flash +} // namespace onnxruntime +#endif diff --git a/onnxruntime/contrib_ops/cuda/bert/flash_attention/flash_fwd_hdim256_bf16_sm80.cu b/onnxruntime/contrib_ops/cuda/bert/flash_attention/flash_fwd_hdim256_bf16_sm80.cu new file mode 100644 index 000000000000..751363184e23 --- /dev/null +++ b/onnxruntime/contrib_ops/cuda/bert/flash_attention/flash_fwd_hdim256_bf16_sm80.cu @@ -0,0 +1,18 @@ +// Copyright (c) 2023, Tri Dao. + +// Splitting the different head dimensions to different files to speed up compilation. +#if USE_FLASH_ATTENTION + +#include "contrib_ops/cuda/bert/flash_attention/flash_fwd_launch_template.h" + +namespace onnxruntime { +namespace flash { + +template<> +void run_mha_fwd_(Flash_fwd_params& params, cudaStream_t stream) { + run_mha_fwd_hdim256(params, stream); +} + +} // namespace flash +} // namespace onnxruntime +#endif diff --git a/onnxruntime/contrib_ops/cuda/bert/flash_attention/flash_fwd_hdim32_bf16_sm80.cu b/onnxruntime/contrib_ops/cuda/bert/flash_attention/flash_fwd_hdim32_bf16_sm80.cu new file mode 100644 index 000000000000..ebf023643597 --- /dev/null +++ b/onnxruntime/contrib_ops/cuda/bert/flash_attention/flash_fwd_hdim32_bf16_sm80.cu @@ -0,0 +1,18 @@ +// Copyright (c) 2023, Tri Dao. + +// Splitting the different head dimensions to different files to speed up compilation. +#if USE_FLASH_ATTENTION + +#include "contrib_ops/cuda/bert/flash_attention/flash_fwd_launch_template.h" + +namespace onnxruntime { +namespace flash { + +template<> +void run_mha_fwd_(Flash_fwd_params& params, cudaStream_t stream) { + run_mha_fwd_hdim32(params, stream); +} + +} // namespace flash +} // namespace onnxruntime +#endif diff --git a/onnxruntime/contrib_ops/cuda/bert/flash_attention/flash_fwd_hdim64_bf16_sm80.cu b/onnxruntime/contrib_ops/cuda/bert/flash_attention/flash_fwd_hdim64_bf16_sm80.cu new file mode 100644 index 000000000000..166bb2a0072f --- /dev/null +++ b/onnxruntime/contrib_ops/cuda/bert/flash_attention/flash_fwd_hdim64_bf16_sm80.cu @@ -0,0 +1,18 @@ +// Copyright (c) 2023, Tri Dao. + +// Splitting the different head dimensions to different files to speed up compilation. +#if USE_FLASH_ATTENTION + +#include "contrib_ops/cuda/bert/flash_attention/flash_fwd_launch_template.h" + +namespace onnxruntime { +namespace flash { + +template<> +void run_mha_fwd_(Flash_fwd_params& params, cudaStream_t stream) { + run_mha_fwd_hdim64(params, stream); +} + +} // namespace flash +} // namespace onnxruntime +#endif diff --git a/onnxruntime/contrib_ops/cuda/bert/flash_attention/flash_fwd_hdim96_bf16_sm80.cu b/onnxruntime/contrib_ops/cuda/bert/flash_attention/flash_fwd_hdim96_bf16_sm80.cu new file mode 100644 index 000000000000..c8760b8168db --- /dev/null +++ b/onnxruntime/contrib_ops/cuda/bert/flash_attention/flash_fwd_hdim96_bf16_sm80.cu @@ -0,0 +1,18 @@ +// Copyright (c) 2023, Tri Dao. + +// Splitting the different head dimensions to different files to speed up compilation. 
+#if USE_FLASH_ATTENTION + +#include "contrib_ops/cuda/bert/flash_attention/flash_fwd_launch_template.h" + +namespace onnxruntime { +namespace flash { + +template<> +void run_mha_fwd_(Flash_fwd_params& params, cudaStream_t stream) { + run_mha_fwd_hdim96(params, stream); +} + +} // namespace flash +} // namespace onnxruntime +#endif diff --git a/onnxruntime/contrib_ops/cuda/bert/flash_attention/flash_fwd_split_hdim128_bf16_sm80.cu b/onnxruntime/contrib_ops/cuda/bert/flash_attention/flash_fwd_split_hdim128_bf16_sm80.cu new file mode 100644 index 000000000000..3ca416f6580c --- /dev/null +++ b/onnxruntime/contrib_ops/cuda/bert/flash_attention/flash_fwd_split_hdim128_bf16_sm80.cu @@ -0,0 +1,15 @@ +// Copyright (c) 2023, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. + +#if USE_FLASH_ATTENTION + +#include "contrib_ops/cuda/bert/flash_attention/flash_fwd_launch_template.h" + +namespace onnxruntime { +namespace flash { + +template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params& params, cudaStream_t stream); + +} // namespace flash +} // namespace onnxruntime +#endif diff --git a/onnxruntime/contrib_ops/cuda/bert/flash_attention/flash_fwd_split_hdim160_bf16_sm80.cu b/onnxruntime/contrib_ops/cuda/bert/flash_attention/flash_fwd_split_hdim160_bf16_sm80.cu new file mode 100644 index 000000000000..3e37c9af80b3 --- /dev/null +++ b/onnxruntime/contrib_ops/cuda/bert/flash_attention/flash_fwd_split_hdim160_bf16_sm80.cu @@ -0,0 +1,15 @@ +// Copyright (c) 2023, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. + +#if USE_FLASH_ATTENTION + +#include "contrib_ops/cuda/bert/flash_attention/flash_fwd_launch_template.h" + +namespace onnxruntime { +namespace flash { + +template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params& params, cudaStream_t stream); + +} // namespace flash +} // namespace onnxruntime +#endif diff --git a/onnxruntime/contrib_ops/cuda/bert/flash_attention/flash_fwd_split_hdim192_bf16_sm80.cu b/onnxruntime/contrib_ops/cuda/bert/flash_attention/flash_fwd_split_hdim192_bf16_sm80.cu new file mode 100644 index 000000000000..79606fd05b4d --- /dev/null +++ b/onnxruntime/contrib_ops/cuda/bert/flash_attention/flash_fwd_split_hdim192_bf16_sm80.cu @@ -0,0 +1,15 @@ +// Copyright (c) 2023, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. + +#if USE_FLASH_ATTENTION + +#include "contrib_ops/cuda/bert/flash_attention/flash_fwd_launch_template.h" + +namespace onnxruntime { +namespace flash { + +template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params& params, cudaStream_t stream); + +} // namespace flash +} // namespace onnxruntime +#endif diff --git a/onnxruntime/contrib_ops/cuda/bert/flash_attention/flash_fwd_split_hdim224_bf16_sm80.cu b/onnxruntime/contrib_ops/cuda/bert/flash_attention/flash_fwd_split_hdim224_bf16_sm80.cu new file mode 100644 index 000000000000..0b0d9384709c --- /dev/null +++ b/onnxruntime/contrib_ops/cuda/bert/flash_attention/flash_fwd_split_hdim224_bf16_sm80.cu @@ -0,0 +1,15 @@ +// Copyright (c) 2023, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. 
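As the recurring header comment says, these new bf16 files exist purely to split compilation: each translation unit pins down one (dtype, head_dim) instantiation of the shared launch template, so a parallel build compiles them concurrently instead of expanding everything in one file. The layout in miniature (hypothetical file names and a stand-in kernel):

```cpp
// Shared template body, normally in a header included by every TU.
template <typename T, int HeadDim>
void run_kernel(const T* in, T* out, int n) {
  for (int i = 0; i < n; ++i) out[i] = in[i];  // stand-in for the real work
}

// kernel_f32_hd64.cc: one explicit instantiation per translation unit.
template void run_kernel<float, 64>(const float*, float*, int);

// kernel_f32_hd128.cc: another TU, compiled independently.
template void run_kernel<float, 128>(const float*, float*, int);
```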
+ +#if USE_FLASH_ATTENTION + +#include "contrib_ops/cuda/bert/flash_attention/flash_fwd_launch_template.h" + +namespace onnxruntime { +namespace flash { + +template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params& params, cudaStream_t stream); + +} // namespace flash +} // namespace onnxruntime +#endif diff --git a/onnxruntime/contrib_ops/cuda/bert/flash_attention/flash_fwd_split_hdim256_bf16_sm80.cu b/onnxruntime/contrib_ops/cuda/bert/flash_attention/flash_fwd_split_hdim256_bf16_sm80.cu new file mode 100644 index 000000000000..8eb5c8f84544 --- /dev/null +++ b/onnxruntime/contrib_ops/cuda/bert/flash_attention/flash_fwd_split_hdim256_bf16_sm80.cu @@ -0,0 +1,15 @@ +// Copyright (c) 2023, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. + +#if USE_FLASH_ATTENTION + +#include "contrib_ops/cuda/bert/flash_attention/flash_fwd_launch_template.h" + +namespace onnxruntime { +namespace flash { + +template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params& params, cudaStream_t stream); + +} // namespace flash +} // namespace onnxruntime +#endif diff --git a/onnxruntime/contrib_ops/cuda/bert/flash_attention/flash_fwd_split_hdim32_bf16_sm80.cu b/onnxruntime/contrib_ops/cuda/bert/flash_attention/flash_fwd_split_hdim32_bf16_sm80.cu new file mode 100644 index 000000000000..0141f27aa199 --- /dev/null +++ b/onnxruntime/contrib_ops/cuda/bert/flash_attention/flash_fwd_split_hdim32_bf16_sm80.cu @@ -0,0 +1,15 @@ +// Copyright (c) 2023, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. + +#if USE_FLASH_ATTENTION + +#include "contrib_ops/cuda/bert/flash_attention/flash_fwd_launch_template.h" + +namespace onnxruntime { +namespace flash { + +template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params& params, cudaStream_t stream); + +} // namespace flash +} // namespace onnxruntime +#endif diff --git a/onnxruntime/contrib_ops/cuda/bert/flash_attention/flash_fwd_split_hdim64_bf16_sm80.cu b/onnxruntime/contrib_ops/cuda/bert/flash_attention/flash_fwd_split_hdim64_bf16_sm80.cu new file mode 100644 index 000000000000..489d2d47bc70 --- /dev/null +++ b/onnxruntime/contrib_ops/cuda/bert/flash_attention/flash_fwd_split_hdim64_bf16_sm80.cu @@ -0,0 +1,15 @@ +// Copyright (c) 2023, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. + +#if USE_FLASH_ATTENTION + +#include "contrib_ops/cuda/bert/flash_attention/flash_fwd_launch_template.h" + +namespace onnxruntime { +namespace flash { + +template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params& params, cudaStream_t stream); + +} // namespace flash +} // namespace onnxruntime +#endif diff --git a/onnxruntime/contrib_ops/cuda/bert/flash_attention/flash_fwd_split_hdim96_bf16_sm80.cu b/onnxruntime/contrib_ops/cuda/bert/flash_attention/flash_fwd_split_hdim96_bf16_sm80.cu new file mode 100644 index 000000000000..bcfd47e76b99 --- /dev/null +++ b/onnxruntime/contrib_ops/cuda/bert/flash_attention/flash_fwd_split_hdim96_bf16_sm80.cu @@ -0,0 +1,15 @@ +// Copyright (c) 2023, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. 
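The static_switch.h hunk just below replaces FP16_SWITCH's assert with a real branch, so one call site can instantiate either the half or the bfloat16 kernels. It works because the macro wraps an immediately-invoked lambda: each branch defines its own elem_type alias, and the caller-supplied lambda that references elem_type is expanded inside that scope. A self-contained sketch of the same pattern (DTYPE_SWITCH and the element types are stand-ins):

```cpp
#include <cstdio>

template <typename T>
void run_kernel() { std::printf("element size = %zu\n", sizeof(T)); }

// Same shape as FP16_SWITCH: a runtime bool selects which compile-time type
// the body is instantiated with; both branches get compiled.
#define DTYPE_SWITCH(COND, ...)   \
  [&] {                           \
    if (COND) {                   \
      using elem_type = float;    \
      return __VA_ARGS__();       \
    } else {                      \
      using elem_type = double;   \
      return __VA_ARGS__();       \
    }                             \
  }()

int main() {
  const bool use_float = true;
  DTYPE_SWITCH(use_float, [&] { run_kernel<elem_type>(); });
  return 0;
}
```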
+ +#if USE_FLASH_ATTENTION + +#include "contrib_ops/cuda/bert/flash_attention/flash_fwd_launch_template.h" + +namespace onnxruntime { +namespace flash { + +template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params& params, cudaStream_t stream); + +} // namespace flash +} // namespace onnxruntime +#endif diff --git a/onnxruntime/contrib_ops/cuda/bert/flash_attention/static_switch.h b/onnxruntime/contrib_ops/cuda/bert/flash_attention/static_switch.h index 05ac2476690c..5b70988949bb 100644 --- a/onnxruntime/contrib_ops/cuda/bert/flash_attention/static_switch.h +++ b/onnxruntime/contrib_ops/cuda/bert/flash_attention/static_switch.h @@ -23,11 +23,15 @@ } \ }() -#define FP16_SWITCH(COND, ...) \ - [&] { \ - assert(COND); \ - using elem_type = cutlass::half_t; \ - return __VA_ARGS__(); \ +#define FP16_SWITCH(COND, ...) \ + [&] { \ + if (COND) { \ + using elem_type = cutlass::half_t; \ + return __VA_ARGS__(); \ + } else { \ + using elem_type = cutlass::bfloat16_t; \ + return __VA_ARGS__(); \ + } \ }() #define FWD_HEADDIM_SWITCH(HEADDIM, ...) \ diff --git a/onnxruntime/contrib_ops/cuda/bert/gemma_rotary_emb.cc b/onnxruntime/contrib_ops/cuda/bert/gemma_rotary_emb.cc new file mode 100644 index 000000000000..49bf79188efd --- /dev/null +++ b/onnxruntime/contrib_ops/cuda/bert/gemma_rotary_emb.cc @@ -0,0 +1,75 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "core/providers/cuda/cuda_common.h" +#include "contrib_ops/cuda/bert/gemma_rotary_emb.h" +#include "contrib_ops/cuda/bert/gemma_rotary_emb_impl.h" + +namespace onnxruntime { +namespace contrib { +namespace cuda { + +#define REGISTER_KERNEL_TYPED(T, U) \ + ONNX_OPERATOR_TYPED_KERNEL_EX( \ + GemmaRotaryEmbedding, \ + kMSDomain, \ + 1, \ + T, \ + kCudaExecutionProvider, \ + (*KernelDefBuilder::Create()) \ + .TypeConstraint("T", DataTypeImpl::GetTensorType()) \ + .TypeConstraint("U", DataTypeImpl::GetTensorType()), \ + GemmaRotaryEmbedding); + +REGISTER_KERNEL_TYPED(MLFloat16, float) + +template +GemmaRotaryEmbedding::GemmaRotaryEmbedding(const OpKernelInfo& info) : CudaKernel(info) { +} + +template +Status GemmaRotaryEmbedding::ComputeInternal(OpKernelContext* context) const { + const Tensor* emb = context->Input(0); + const Tensor* q = context->Input(1); + const Tensor* q_rot = context->Input(2); + const Tensor* k = context->Input(3); + const Tensor* k_rot = context->Input(4); + + const auto& emb_dims = emb->Shape().GetDims(); + const auto& q_dims = q->Shape().GetDims(); + int batch_size = static_cast(q_dims[0]); + int num_heads = static_cast(q_dims[1]); + int seq_len = static_cast(q_dims[2]); + int dim = static_cast(q_dims[3]); + + // q_dims should be [batch_size, num_heads, seq_len, dim] + // emb_dims should be [batch_size, seq_len, dim] + ORT_ENFORCE(emb_dims.size() == 3, "emb_dims should be 3D"); + ORT_ENFORCE(q_dims.size() == 4, "q_dims should be 4D"); + ORT_ENFORCE(emb_dims[0] == batch_size, "emb_dims[0] should match q_dims[0]"); + ORT_ENFORCE(emb_dims[1] == seq_len, "emb_dims[1] should match q_dims[2]"); + ORT_ENFORCE(emb_dims[2] == dim, "emb_dims[2] should match q_dims[3]"); + + Tensor* output1 = context->Output(0, q_dims); + Tensor* output2 = context->Output(1, q_dims); + + typedef typename ToCudaType::MappedType CudaT; + typedef typename ToCudaType::MappedType CudaU; + return LaunchGemmaRotaryEmbeddingKernel( + Stream(context), + reinterpret_cast(output1->template MutableData()), + reinterpret_cast(output2->template MutableData()), + reinterpret_cast(emb->template Data()), + 
reinterpret_cast(q->template Data()), + reinterpret_cast(q_rot->template Data()), + reinterpret_cast(k->template Data()), + reinterpret_cast(k_rot->template Data()), + batch_size, + num_heads, + seq_len, + dim); +} + +} // namespace cuda +} // namespace contrib +} // namespace onnxruntime diff --git a/onnxruntime/contrib_ops/cuda/bert/gemma_rotary_emb.h b/onnxruntime/contrib_ops/cuda/bert/gemma_rotary_emb.h new file mode 100644 index 000000000000..e63236d2ab7c --- /dev/null +++ b/onnxruntime/contrib_ops/cuda/bert/gemma_rotary_emb.h @@ -0,0 +1,24 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once +#include "core/common/common.h" +#include "core/providers/cuda/cuda_kernel.h" + +namespace onnxruntime { +namespace contrib { +namespace cuda { + +using onnxruntime::cuda::CudaKernel; +using onnxruntime::cuda::ToCudaType; + +template +class GemmaRotaryEmbedding final : public CudaKernel { + public: + GemmaRotaryEmbedding(const OpKernelInfo& info); + Status ComputeInternal(OpKernelContext* context) const override; +}; + +} // namespace cuda +} // namespace contrib +} // namespace onnxruntime diff --git a/onnxruntime/contrib_ops/cuda/bert/gemma_rotary_emb_impl.cu b/onnxruntime/contrib_ops/cuda/bert/gemma_rotary_emb_impl.cu new file mode 100644 index 000000000000..9e00ca713a44 --- /dev/null +++ b/onnxruntime/contrib_ops/cuda/bert/gemma_rotary_emb_impl.cu @@ -0,0 +1,104 @@ +/* +Copyright (c) Microsoft Corporation. +Licensed under the MIT License. +*/ +/* +Kernel implementation for Gemma rotary embeddings. +This implements the subgraph below: + (emb) + / \ + / \ + Sin Cos + | | + Cast Cast + | | + Unsqueeze Unsqueeze + \/ \/ \/ \/ + Mul Mul Mul Mul + \ / \ / + Add Add + | | + (output1) (output2) +*/ + +#include +#include +#include "core/providers/cuda/cu_inc/common.cuh" +#include "contrib_ops/cuda/bert/gemma_rotary_emb_impl.h" + +using namespace onnxruntime::cuda; + +namespace onnxruntime { +namespace contrib { +namespace cuda { + +constexpr int kThreadsPerBlock = GridDim::maxThreadsPerBlock; + +template +__global__ void GemmaRotaryEmb( + T* output1, + T* output2, + const U* emb, + const T* q, + const T* q_rot, + const T* k, + const T* k_rot, + const int batch_size, + const int num_heads, + const int seq_len, + const int dim) { + + const int qk_idx = blockIdx.x * blockDim.x + threadIdx.x; + // index [i, j, k, l] -> [i, k, l] + const int emb_idx = qk_idx / (num_heads * seq_len * dim) * (seq_len * dim) + qk_idx % (seq_len * dim); + if (qk_idx < batch_size * num_heads * seq_len * dim) { + T sin_val = static_cast(sin(emb[emb_idx])); + T cos_val = static_cast(cos(emb[emb_idx])); + output1[qk_idx] = q[qk_idx] * cos_val + q_rot[qk_idx] * sin_val; + output2[qk_idx] = k[qk_idx] * cos_val + k_rot[qk_idx] * sin_val; + } +} + +template +Status LaunchGemmaRotaryEmbeddingKernel( + cudaStream_t stream, + T* output1, + T* output2, + const U* emb, + const T* q, + const T* q_rot, + const T* k, + const T* k_rot, + const int batch_size, + const int num_heads, + const int seq_len, + const int dim + ) { + int blocksPerGrid = static_cast(ceil(float(batch_size * num_heads * seq_len * dim) / kThreadsPerBlock)); + + GemmaRotaryEmb<<>>( + output1, output2, + emb, q, q_rot, k, k_rot, + batch_size, num_heads, seq_len, dim + ); + + return CUDA_CALL(cudaGetLastError()); } + +template Status LaunchGemmaRotaryEmbeddingKernel( + cudaStream_t stream, + half* output1, + half* output2, + const float* emb, + const half* q, + const half* q_rot, + const half* k, + const half* 
k_rot, + const int batch_size, + const int num_heads, + const int seq_len, + const int dim); + +} // namespace cuda +} // namespace contrib +} // namespace onnxruntime diff --git a/onnxruntime/contrib_ops/cuda/bert/gemma_rotary_emb_impl.h b/onnxruntime/contrib_ops/cuda/bert/gemma_rotary_emb_impl.h new file mode 100644 index 000000000000..c57fbe0d7e92 --- /dev/null +++ b/onnxruntime/contrib_ops/cuda/bert/gemma_rotary_emb_impl.h @@ -0,0 +1,29 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once +#include "core/common/common.h" +#include "core/providers/cuda/shared_inc/cuda_utils.h" + +namespace onnxruntime { +namespace contrib { +namespace cuda { + +template +Status LaunchGemmaRotaryEmbeddingKernel( + cudaStream_t stream, + T* output1, + T* output2, + const U* emb, + const T* q, + const T* q_rot, + const T* k, + const T* k_rot, + const int batch_size, + const int num_heads, + const int seq_len, + const int dim); + +} // namespace cuda +} // namespace contrib +} // namespace onnxruntime diff --git a/onnxruntime/contrib_ops/cuda/bert/group_query_attention.cc b/onnxruntime/contrib_ops/cuda/bert/group_query_attention.cc index 93892169f6c7..112f609d4659 100644 --- a/onnxruntime/contrib_ops/cuda/bert/group_query_attention.cc +++ b/onnxruntime/contrib_ops/cuda/bert/group_query_attention.cc @@ -34,6 +34,7 @@ namespace cuda { // REGISTER_KERNEL_TYPED(float) REGISTER_KERNEL_TYPED(MLFloat16) +REGISTER_KERNEL_TYPED(BFloat16) template GroupQueryAttention::GroupQueryAttention(const OpKernelInfo& info) @@ -46,6 +47,8 @@ GroupQueryAttention::GroupQueryAttention(const OpKernelInfo& info) kv_num_heads_ = static_cast(kv_num_heads); is_past_bsnh_ = false; // info.GetAttrOrDefault("is_past_bsnh", 1) == 1; local_window_size_ = static_cast(info.GetAttrOrDefault("local_window_size", -1)); + do_rotary_ = info.GetAttrOrDefault("do_rotary", 0) == 1; + rotary_interleaved_ = info.GetAttrOrDefault("rotary_interleaved", 0) == 1; scale_ = info.GetAttrOrDefault("scale", 0.0f); #if USE_FLASH_ATTENTION @@ -61,6 +64,9 @@ GroupQueryAttention::GroupQueryAttention(const OpKernelInfo& info) #else disable_memory_efficient_attention_ = true; #endif + if (!disable_flash_attention_) { + zeros_ = this->GetScratchBuffer(kZerosCount, nullptr); + } } template @@ -72,6 +78,8 @@ Status GroupQueryAttention::ComputeInternal(OpKernelContext* context) const { const Tensor* past_value = context->Input(4); const Tensor* seqlens_k = context->Input(5); const Tensor* total_seqlen = context->Input(6); + const Tensor* cos_cache = context->Input(7); + const Tensor* sin_cache = context->Input(8); auto& device_prop = GetDeviceProp(); GroupQueryAttentionParameters parameters; @@ -83,6 +91,8 @@ Status GroupQueryAttention::ComputeInternal(OpKernelContext* context) const { value, past_key, past_value, + cos_cache, + sin_cache, ¶meters, num_heads_, kv_num_heads_, @@ -92,7 +102,18 @@ Status GroupQueryAttention::ComputeInternal(OpKernelContext* context) const { scale_, device_prop.maxThreadsPerBlock)); parameters.local_window_size = local_window_size_; + parameters.is_unidirectional = is_unidirectional_; + parameters.zeros_count = kZerosCount; + parameters.zero_ptr = zeros_.get(); + // parameters.left_padding = left_padding_; int sequence_length = parameters.sequence_length; + parameters.do_rotary = do_rotary_; + parameters.rotary_interleaved = rotary_interleaved_; + + if (do_rotary_ && (cos_cache == nullptr || sin_cache == nullptr)) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, + 
"cos_cache and sin_cache must be passed to GroupQueryAttention when do_rotary = 1"); + } TensorShapeVector output_shape(3); output_shape[0] = static_cast(parameters.batch_size); @@ -149,18 +170,31 @@ Status GroupQueryAttention::ComputeInternal(OpKernelContext* context) const { if (use_memory_efficient_attention && needs_buff) { kv_buffer_bytes = (sizeof(T) * parameters.batch_size * parameters.num_heads * parameters.seqlen_present_kv_cache * parameters.head_size); } + size_t rotary_buffer_bytes = 0; + if (use_memory_efficient_attention && do_rotary_) { + rotary_buffer_bytes = 2 * sizeof(T) * parameters.batch_size * parameters.num_heads * parameters.sequence_length * parameters.head_size; + rotary_buffer_bytes += sizeof(int64_t) * parameters.batch_size * parameters.sequence_length; + } size_t fmha_buffer_bytes = 0; if (use_memory_efficient_attention && MemoryEfficientAttentionParams::need_workspace(parameters.head_size, sizeof(T) == sizeof(float))) { fmha_buffer_bytes = (parameters.batch_size * parameters.sequence_length * parameters.num_heads * parameters.head_size * sizeof(float)); } + size_t unpacked_qkv_bytes = 0; + if (use_memory_efficient_attention && parameters.is_packed_qkv) { + unpacked_qkv_bytes = (parameters.batch_size * parameters.sequence_length * (parameters.num_heads + 2 * parameters.kv_num_heads) * parameters.head_size * sizeof(T)); + } auto k_buffer = GetScratchBuffer(kv_buffer_bytes, context->GetComputeStream()); auto v_buffer = GetScratchBuffer(kv_buffer_bytes, context->GetComputeStream()); + auto rotary_buffer = GetScratchBuffer(rotary_buffer_bytes, context->GetComputeStream()); auto fmha_buffer = GetScratchBuffer(fmha_buffer_bytes, context->GetComputeStream()); + auto unpacked_qkv_buffer = GetScratchBuffer(unpacked_qkv_bytes, context->GetComputeStream()); #else constexpr bool use_memory_efficient_attention = false; auto k_buffer = GetScratchBuffer(0, context->GetComputeStream()); auto v_buffer = GetScratchBuffer(0, context->GetComputeStream()); + auto rotary_buffer = GetScratchBuffer(0, context->GetComputeStream()); auto fmha_buffer = GetScratchBuffer(0, context->GetComputeStream()); + auto unpacked_qkv_buffer = GetScratchBuffer(0, context->GetComputeStream()); #endif // seqlens_k buffer @@ -181,8 +215,8 @@ Status GroupQueryAttention::ComputeInternal(OpKernelContext* context) const { Tensor* present_value = context->Output(2, present_shape); data.query = reinterpret_cast(query->Data()); - data.key = reinterpret_cast(key->Data()); - data.value = reinterpret_cast(value->Data()); + data.key = key == nullptr ? nullptr : reinterpret_cast(key->Data()); + data.value = value == nullptr ? nullptr : reinterpret_cast(value->Data()); data.past_key = (nullptr == past_key) ? nullptr : reinterpret_cast(past_key->Data()); data.past_value = (nullptr == past_value) ? 
nullptr : reinterpret_cast(past_value->Data()); data.output = reinterpret_cast(output->MutableData()); @@ -228,6 +262,17 @@ Status GroupQueryAttention::ComputeInternal(OpKernelContext* context) const { if (fmha_buffer != nullptr) { data.fmha_buffer = reinterpret_cast(fmha_buffer.get()); } + if (unpacked_qkv_buffer != nullptr) { + data.unpacked_qkv_buffer = reinterpret_cast(unpacked_qkv_buffer.get()); + } + if (rotary_buffer != nullptr) { + data.rotary_buffer = reinterpret_cast(rotary_buffer.get()); + } + // Rotary Embedding + if (parameters.do_rotary) { + data.cos_cache = reinterpret_cast(cos_cache->Data()); + data.sin_cache = reinterpret_cast(sin_cache->Data()); + } cublasHandle_t cublas = GetCublasHandle(context); diff --git a/onnxruntime/contrib_ops/cuda/bert/group_query_attention.h b/onnxruntime/contrib_ops/cuda/bert/group_query_attention.h index 54a8127e29e7..15573ece166f 100644 --- a/onnxruntime/contrib_ops/cuda/bert/group_query_attention.h +++ b/onnxruntime/contrib_ops/cuda/bert/group_query_attention.h @@ -23,10 +23,15 @@ class GroupQueryAttention final : public CudaKernel { int num_heads_; // number of attention heads int kv_num_heads_; // different for k and v for group query attention int local_window_size_; + bool is_unidirectional_; bool is_past_bsnh_; + bool do_rotary_; + bool rotary_interleaved_; float scale_; bool disable_flash_attention_; bool disable_memory_efficient_attention_; + static constexpr int kZerosCount = 256; // In prompt case we create a zero buffer of size 256 for seqlen (assume batch_size <= 256) + IAllocatorUniquePtr zeros_; }; } // namespace cuda diff --git a/onnxruntime/contrib_ops/cuda/bert/group_query_attention_helper.h b/onnxruntime/contrib_ops/cuda/bert/group_query_attention_helper.h index 2cb9955807f2..1a7c3fcea3fa 100644 --- a/onnxruntime/contrib_ops/cuda/bert/group_query_attention_helper.h +++ b/onnxruntime/contrib_ops/cuda/bert/group_query_attention_helper.h @@ -16,6 +16,8 @@ Status CheckInputs(const Tensor* query, const Tensor* value, const Tensor* past_key, const Tensor* past_value, + const Tensor* cos_cache, + const Tensor* sin_cache, void* parameters, int num_heads, int kv_num_heads, @@ -24,19 +26,18 @@ Status CheckInputs(const Tensor* query, bool is_past_bsnh, float scale) { // Note: Here S* is past_cache_sequence_length, S- is past_sequence_length, S+ is sequence_length - // past_key : (B, N_k, S*, H) or (B, N_k, S-, H) - // past_value : (B, N_k, S*, H) or (B, N_k, S-, H) + // past_key : (B, N_k, S*, H) or (B, N_k, S-, H) or nullptr + // past_value : (B, N_k, S*, H) or (B, N_k, S-, H) or nullptr // no packing for q/k/v: - // query (Q) : (B, S, D) - // key (K) : (B, S, D_kv) - // value (V) : (B, S, D_kv) + // query (Q) : (B, S, D) or (B, S, (D_q + 2 D_kv)) + // key (K) : (B, S, D_kv) or nullptr + // value (V) : (B, S, D_kv) or nullptr ORT_UNUSED_PARAMETER(value); AttentionQkvFormat qkv_format = Q_K_V_BSNH; AttentionQkvFormat past_kv_format = is_past_bsnh ? 
Q_K_V_BSNH : Q_K_V_BNSH; - + const bool is_packed_qkv = key == nullptr; const auto& query_dims = query->Shape().GetDims(); - const auto& key_dims = key->Shape().GetDims(); if (query_dims.size() != 3) { return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Input 'query' is expected to have 3 dimensions, got ", @@ -46,10 +47,69 @@ Status CheckInputs(const Tensor* query, int batch_size = static_cast(query_dims[0]); int sequence_length = static_cast(query_dims[1]); int q_hidden_size = static_cast(query_dims[2]); - int head_size = static_cast(q_hidden_size) / num_heads; + int head_size = 0; - int kv_hidden_size = static_cast(key_dims[2]); + if (num_heads % kv_num_heads != 0) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, + "num_heads must be a multiple of kv_num_heads. Got num_heads % kv_num_heads == ", + num_heads % kv_num_heads); + } + int kv_hidden_size = 0; + // Check key and value when not packed + if (!is_packed_qkv) { + head_size = static_cast(q_hidden_size) / num_heads; + if (head_size % 8 != 0) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, + "head_size must be a multiple of 8. Got head_size % 8 == ", + head_size % 8); + } + if (value == nullptr) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, + "Input 'key' and 'value' shall be both present, or both absent in the case of packed qkv."); + } + const auto& key_dims = key->Shape().GetDims(); + if (key_dims.size() != 3) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Input 'key' is expected to have 3 dimensions, got ", + key_dims.size()); + } else if (query_dims[0] != key_dims[0]) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, + "Input 'query' and 'key' shall have same dim 0 (batch size)"); + } else if (query_dims[1] != key_dims[1]) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, + "Input 'query' and 'key' shall have same dim 1 (sequence length)"); + } + kv_hidden_size = static_cast(key_dims[2]); + const auto& value_dims = value->Shape().GetDims(); + if (value_dims.size() != 3) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Input 'value' is expected to have 3 dimensions, got ", + value_dims.size()); + } else if (query_dims[0] != value_dims[0]) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, + "Input 'query' and 'value' shall have same dim 0 (batch size)"); + } else if (query_dims[1] != value_dims[1]) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, + "Input 'query' and 'value' shall have same dim 1 (sequence length)"); + } else if (value_dims[2] != kv_hidden_size) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Input 'value' is expected to have same hidden size as key."); + } + } else { + // Check packed qkv + head_size = static_cast(q_hidden_size) / (num_heads + 2 * kv_num_heads); + if (head_size % 8 != 0) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, + "head_size must be a multiple of 8. 
Got head_size % 8 == ", + head_size % 8); + } + if (value != nullptr) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, + "Input 'key' and 'value' shall be both present, or both absent in the case of packed qkv."); + } + q_hidden_size = head_size * num_heads; + kv_hidden_size = head_size * kv_num_heads; + } + + // Check past-present KV int32_t past_sequence_length = 0; if (past_key != nullptr && past_value != nullptr) { const auto& past_key_dims = past_key->Shape().GetDims(); @@ -130,41 +190,6 @@ Status CheckInputs(const Tensor* query, "Input 'past_key' and 'past_value' shall be both present or both absent."); } - if (key_dims.size() != 3) { - return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Input 'key' is expected to have 3 dimensions, got ", - key_dims.size()); - } - if (query_dims[0] != key_dims[0]) { - return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, - "Input 'query' and 'key' shall have same dim 0 (batch size)"); - } - - if (num_heads % kv_num_heads != 0) { - return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, - "num_heads must be a multiple of kv_num_heads. Got num_heads % kv_num_heads == ", - num_heads % kv_num_heads); - } - - const auto& value_dims = value->Shape().GetDims(); - if (value_dims.size() != 3) { - return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Input 'value' is expected to have 3 dimensions, got ", - value_dims.size()); - } - - if (query_dims[0] != value_dims[0]) { - return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, - "Input 'query' and 'value' shall have same dim 0 (batch_size)"); - } - - if (static_cast(sequence_length) != value_dims[1]) { - return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, - "Input 'query,' 'key,' and 'value' shall have the same dim 1 (sequence_length)"); - } - - if (value_dims[2] != kv_hidden_size) { - return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Input 'value' is expected to have same hidden size as key."); - } - // Check seqlens_k tensor (holding past seqlen for token gen) const auto& seqlens_dim = seqlens_k->Shape().GetDims(); if (seqlens_dim.size() != 1 && seqlens_dim[0] != batch_size) { @@ -180,6 +205,42 @@ Status CheckInputs(const Tensor* query, int total_sequence_length = *((*total_seqlen).template Data()); int present_sequence_length = std::max(total_sequence_length, past_sequence_length); + int rotary_dim = 0; + if (cos_cache != nullptr && sin_cache != nullptr) { + const auto& cos_dims = cos_cache->Shape().GetDims(); + const auto& sin_dims = sin_cache->Shape().GetDims(); + + if (head_size % 16 != 0) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, + "head_size shall be a multiple of 16. 
Got head_size % 16 == ", + head_size % 16); + } + if (cos_dims[0] < present_sequence_length) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, + "cos_cache dimension 0 should be of max_sequence_length."); + } + if (sin_dims[0] < present_sequence_length) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, + "sin_cache dimension 0 should be of max_sequence_length."); + } + if (cos_dims[1] > (head_size / 16) * 8 || cos_dims[1] % 8 != 0) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, + "cos_cache dimension 1 must be <= head_size / 2 and a multiple of 8."); + } + if (sin_dims[1] > (head_size / 16) * 8 || sin_dims[1] % 8 != 0) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, + "sin_cache dimension 1 must be <= head_size / 2 and a multiple of 8."); + } + if (cos_dims[1] != sin_dims[1]) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, + "cos_cache and sin_cache dimension 1 must be the same."); + } + rotary_dim = static_cast(cos_dims[1] * 2); + } else if (cos_cache != nullptr || sin_cache != nullptr) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, + "Input 'cos_cache' and 'sin_cache' shall be both present or both absent."); + } + bool is_prompt = sequence_length != 1; if (parameters != nullptr) { @@ -190,9 +251,11 @@ Status CheckInputs(const Tensor* query, output_parameters->seqlen_present_kv_cache = present_sequence_length; // max sequence length of present kv tensors output_parameters->hidden_size = q_hidden_size; output_parameters->num_heads = num_heads; - output_parameters->head_size = q_hidden_size / num_heads; + output_parameters->head_size = head_size; output_parameters->kv_hidden_size = kv_hidden_size; output_parameters->kv_num_heads = kv_num_heads; + output_parameters->rotary_dim = rotary_dim; + output_parameters->is_packed_qkv = is_packed_qkv; output_parameters->is_unidirectional = true; output_parameters->is_prompt = is_prompt; output_parameters->scale = scale; @@ -208,6 +271,8 @@ Status CheckInputs(const Tensor* query, const Tensor* value, const Tensor* past_key, const Tensor* past_value, + const Tensor* cos_cache, + const Tensor* sin_cache, void* parameters, int num_heads, int kv_num_heads, @@ -220,7 +285,7 @@ Status CheckInputs(const Tensor* query, return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "num_heads should be no larger than ", max_threads_per_block); } - return CheckInputs(query, key, value, past_key, past_value, parameters, num_heads, kv_num_heads, seqlens_k, total_seqlen, is_past_bsnh, scale); + return CheckInputs(query, key, value, past_key, past_value, cos_cache, sin_cache, parameters, num_heads, kv_num_heads, seqlens_k, total_seqlen, is_past_bsnh, scale); } } // namespace group_query_attention_helper diff --git a/onnxruntime/contrib_ops/cuda/bert/group_query_attention_impl.cu b/onnxruntime/contrib_ops/cuda/bert/group_query_attention_impl.cu index b22ccb68c1e7..f519be1c9714 100644 --- a/onnxruntime/contrib_ops/cuda/bert/group_query_attention_impl.cu +++ b/onnxruntime/contrib_ops/cuda/bert/group_query_attention_impl.cu @@ -42,6 +42,7 @@ limitations under the License. 
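
The CheckInputs rework above hinges on one piece of size bookkeeping: with packed QKV the query tensor carries all three projections per token, so head_size divides by (num_heads + 2 * kv_num_heads) rather than num_heads, and rotary_dim is recovered as twice the cache's second dimension. A minimal host-side sketch of that arithmetic (the struct and function names are illustrative, not part of the helper):

#include <cassert>

// Illustrative recomputation of the sizes CheckInputs derives above.
struct GqaSizes {
  int head_size = 0;
  int q_hidden_size = 0;
  int kv_hidden_size = 0;
  int rotary_dim = 0;  // stays 0 when cos/sin caches are absent
};

GqaSizes ComputeGqaSizes(int q_hidden, int num_heads, int kv_num_heads,
                         bool is_packed_qkv, int cos_cache_dim1) {
  assert(num_heads % kv_num_heads == 0);
  GqaSizes s;
  if (is_packed_qkv) {
    // Packed layout stores [q | k | v] per token: hidden = (N + 2 * kv_N) * H.
    s.head_size = q_hidden / (num_heads + 2 * kv_num_heads);
    s.q_hidden_size = s.head_size * num_heads;
    s.kv_hidden_size = s.head_size * kv_num_heads;
  } else {
    // Unpacked: the helper actually reads kv_hidden_size from key_dims[2];
    // assuming here that K/V share the query's head_size, as GQA requires.
    s.head_size = q_hidden / num_heads;
    s.q_hidden_size = q_hidden;
    s.kv_hidden_size = s.head_size * kv_num_heads;
  }
  assert(s.head_size % 8 == 0);  // enforced on both paths above
  // cos/sin caches are (max_seq_len, rotary_dim / 2), so dim 1 doubles back.
  s.rotary_dim = 2 * cos_cache_dim1;
  return s;
}

For example, num_heads = 32, kv_num_heads = 8, head_size = 128 gives a packed query width of (32 + 16) * 128 = 6144, from which q_hidden_size = 4096 and kv_hidden_size = 1024 are recovered. The UnpackQKV kernel later in this diff walks the packed buffer with the same stride d = (N + 2 * kv_N) * H, routing offsets below N * H to Q, the next kv_N * H to K, and the remainder to V.
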
#include "contrib_ops/cuda/bert/group_query_attention_impl.h" #include "contrib_ops/cuda/bert/attention_impl.h" #include "core/providers/cuda/shared_inc/cuda_call.h" +#include "contrib_ops/cuda/bert/rotary_embedding_impl.h" #include using namespace onnxruntime::cuda; @@ -150,10 +151,13 @@ __global__ void ConcatNewToPastKVLarge(const int new_seqlen, template Status LaunchConcatNewToPastKV(contrib::GroupQueryAttentionParameters& parameters, GroupQueryAttentionData& data, + const void* new_key, + const void* new_value, cudaStream_t stream, - const int max_threads_per_block) { + const int max_threads_per_block, + const bool past_only = false) { const int batch_size = parameters.batch_size; - const int kv_sequence_length = parameters.sequence_length; + const int kv_sequence_length = past_only ? 0 : parameters.sequence_length; const int past_sequence_length = parameters.seqlen_past_kv_cache; const int present_sequence_length = parameters.seqlen_present_kv_cache; const int kv_num_heads = parameters.kv_num_heads; @@ -170,14 +174,14 @@ Status LaunchConcatNewToPastKV(contrib::GroupQueryAttentionParameters& parameter ConcatNewToPastKV<<>>(kv_sequence_length, past_sequence_length, reinterpret_cast(data.past_key), - reinterpret_cast(data.key), + reinterpret_cast(new_key), reinterpret_cast(data.present_key), seqlens_k, past_kv_format == AttentionQkvFormat::Q_K_V_BSNH); ConcatNewToPastKV<<>>(kv_sequence_length, past_sequence_length, reinterpret_cast(data.past_value), - reinterpret_cast(data.value), + reinterpret_cast(new_value), reinterpret_cast(data.present_value), seqlens_k, past_kv_format == AttentionQkvFormat::Q_K_V_BSNH); @@ -190,7 +194,7 @@ Status LaunchConcatNewToPastKV(contrib::GroupQueryAttentionParameters& parameter H, kv_num_heads, reinterpret_cast(data.past_key), - reinterpret_cast(data.key), + reinterpret_cast(new_key), reinterpret_cast(data.present_key), seqlens_k, past_kv_format == AttentionQkvFormat::Q_K_V_BSNH); @@ -199,7 +203,7 @@ Status LaunchConcatNewToPastKV(contrib::GroupQueryAttentionParameters& parameter H, kv_num_heads, reinterpret_cast(data.past_value), - reinterpret_cast(data.value), + reinterpret_cast(new_value), reinterpret_cast(data.present_value), seqlens_k, past_kv_format == AttentionQkvFormat::Q_K_V_BSNH); @@ -280,6 +284,8 @@ __global__ void ConcatKVInPlaceLarge(const int max_seqlen, template Status LaunchConcatKVInPlace(contrib::GroupQueryAttentionParameters& parameters, GroupQueryAttentionData& data, + const void* new_key, + const void* new_value, cudaStream_t stream, const int max_threads_per_block) { const int batch_size = parameters.batch_size; @@ -299,12 +305,12 @@ Status LaunchConcatKVInPlace(contrib::GroupQueryAttentionParameters& parameters, const dim3 block(H, kv_num_heads, 1); ConcatKVInPlace<<>>(present_sequence_length, reinterpret_cast(data.present_key), - reinterpret_cast(data.key), + reinterpret_cast(new_key), seqlens_k, past_kv_format == AttentionQkvFormat::Q_K_V_BSNH); ConcatKVInPlace<<>>(present_sequence_length, reinterpret_cast(data.present_value), - reinterpret_cast(data.value), + reinterpret_cast(new_value), seqlens_k, past_kv_format == AttentionQkvFormat::Q_K_V_BSNH); } else { @@ -315,14 +321,14 @@ Status LaunchConcatKVInPlace(contrib::GroupQueryAttentionParameters& parameters, H, kv_num_heads, reinterpret_cast(data.present_key), - reinterpret_cast(data.key), + reinterpret_cast(new_key), seqlens_k, past_kv_format == AttentionQkvFormat::Q_K_V_BSNH); ConcatKVInPlaceLarge<<>>(present_sequence_length, H, kv_num_heads, 
reinterpret_cast(data.present_value), - reinterpret_cast(data.value), + reinterpret_cast(new_value), seqlens_k, past_kv_format == AttentionQkvFormat::Q_K_V_BSNH); } @@ -441,7 +447,6 @@ Status LaunchUngroup(contrib::GroupQueryAttentionParameters& parameters, return CUDA_CALL(cudaGetLastError()); } - __global__ void PastToTotalSeqlen(int32_t* seqlens_k, int32_t* seqlens_k_buff, const int add_seqlen) { @@ -451,7 +456,7 @@ __global__ void PastToTotalSeqlen(int32_t* seqlens_k, // Convert Past to Total sequence length tensor Status LaunchGetSeqlenBuff(contrib::GroupQueryAttentionParameters& parameters, int32_t* seqlens_k, int32_t* seqlens_k_buff, bool is_total, cudaStream_t stream, - const int threads_per_block) { + const int /*threads_per_block*/) { if (parameters.is_prompt) { return Status::OK(); } @@ -468,6 +473,83 @@ Status LaunchGetSeqlenBuff(contrib::GroupQueryAttentionParameters& parameters, i return CUDA_CALL(cudaGetLastError()); } +// Kernel to unpack qkv from packed qkv +template +__global__ void UnpackQKV(const T* packed_qkv, T* unpacked_q, T* unpacked_k, T* unpacked_v, const int num_heads, + const int kv_num_heads, const int head_size, const int sequence_length, + const int batch_size) { + const int tid = threadIdx.x + blockIdx.x * blockDim.x; + int d = (num_heads + 2 * kv_num_heads) * head_size; + const int qkv_size = batch_size * sequence_length * d; + const int q_size = num_heads * head_size; + const int k_size = kv_num_heads * head_size; + if (tid < qkv_size) { + int batch = tid / (d * sequence_length); + int sequence = (tid % (d * sequence_length)) / d; + int offset = tid % d; + if (offset < q_size) { + int unpacked_i = batch * sequence_length * num_heads * head_size + sequence * num_heads * head_size + offset; + unpacked_q[unpacked_i] = packed_qkv[tid]; + } else if (offset < q_size + k_size) { + int unpacked_i = batch * sequence_length * kv_num_heads * head_size + sequence * kv_num_heads * head_size + (offset - q_size); + unpacked_k[unpacked_i] = packed_qkv[tid]; + } else { + int unpacked_i = batch * sequence_length * kv_num_heads * head_size + sequence * kv_num_heads * head_size + (offset - q_size - k_size); + unpacked_v[unpacked_i] = packed_qkv[tid]; + } + } +} + +// Unpack packed qkv +template +Status LaunchUnpackQKV(const T* packed_qkv, T* unpacked_q, T* unpacked_k, T* unpacked_v, const int num_heads, + const int kv_num_heads, const int head_size, const int sequence_length, const int batch_size, + cudaStream_t stream, const int max_threads_per_block) { + const int threads = max_threads_per_block; + const int blocks = (batch_size * sequence_length * (num_heads + 2 * kv_num_heads) * head_size + threads - 1) / threads; + UnpackQKV<<>>(packed_qkv, unpacked_q, unpacked_k, unpacked_v, num_heads, kv_num_heads, + head_size, sequence_length, batch_size); + return CUDA_CALL(cudaGetLastError()); +} + +// Kernel to convert seqlens_k to position_ids +__global__ void SeqlensToPosIdsPrompt(int32_t* seqlens_k, int64_t* position_ids, const int seqlen, + const int batch_size) { + int tid = blockDim.x * blockIdx.x + threadIdx.x; + int b = tid / seqlen; + int s = tid % seqlen; + if (b < batch_size) { + if (s < seqlens_k[b] + 1) { + position_ids[tid] = s; + } else { + position_ids[tid] = 1; + } + } +} + +// Kernel to convert seqlens_k to position_ids +__global__ void SeqlensToPosIdsToken(int32_t* seqlens_k, int64_t* position_ids, const int batch_size) { + int tid = blockDim.x * blockIdx.x + threadIdx.x; + if (tid < batch_size) { + position_ids[tid] = seqlens_k[tid]; + } +} + +// Convert 
seqlens_k to position_ids +Status LaunchSeqlensToPosIds(contrib::GroupQueryAttentionParameters& parameters, int32_t* seqlens_k, + int64_t* position_ids, cudaStream_t stream, const int max_threads_per_block) { + const int seqlen = parameters.sequence_length; + const int batch_size = parameters.batch_size; + const int threads = max_threads_per_block; + const int blocks = (batch_size * seqlen + threads - 1) / threads; + if (parameters.is_prompt) { + SeqlensToPosIdsPrompt<<>>(seqlens_k, position_ids, seqlen, batch_size); + } else { + SeqlensToPosIdsToken<<>>(seqlens_k, position_ids, batch_size); + } + return CUDA_CALL(cudaGetLastError()); +} + ////////// Launch Kernels #if USE_FLASH_ATTENTION @@ -482,89 +564,64 @@ Status FlashAttention( const int batch_size = parameters.batch_size; const int sequence_length = parameters.sequence_length; const int kv_sequence_length = parameters.sequence_length; - const int present_sequence_length = parameters.seqlen_present_kv_cache; const int num_heads = parameters.num_heads; const int kv_num_heads = parameters.kv_num_heads; const int head_size = parameters.head_size; AttentionQkvFormat past_kv_format = parameters.past_kv_format; + bool is_causal = true; + bool is_bf16 = std::is_same::value; void* query = reinterpret_cast(const_cast(data.query)); - void* key = reinterpret_cast(const_cast(data.key)); - void* value = reinterpret_cast(const_cast(data.value)); - - bool is_causal = true; + void* key; + void* value; - // Note: seqlens_k is past sequence length for flash - if (parameters.is_prompt) { - // Launch kernel to copy seqlen - constexpr int thr_per_blk = 256; - int blk_in_grid = (batch_size + thr_per_blk -1) / thr_per_blk; - repeat_seqlen<<>>(data.seqlens_k_total, parameters.sequence_length, batch_size); + if (!parameters.is_packed_qkv) { + key = reinterpret_cast(const_cast(data.key)); + value = reinterpret_cast(const_cast(data.value)); + } else { + const size_t key_offset = static_cast(num_heads * head_size); + const size_t value_offset = static_cast(kv_num_heads * head_size); + key = reinterpret_cast(query) + key_offset; + value = reinterpret_cast(key) + value_offset; } void* seqlens_k = reinterpret_cast(data.seqlens_k); - - if (parameters.kv_share_buffer) { - // Share buffer case - if (data.past_key == nullptr || data.past_key != data.present_key) { - return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, - "Past and present kv shall share the same tensor when kv_share_buffer is on."); - } - - if (parameters.is_prompt) { - ORT_RETURN_IF_ERROR(LaunchConcatKVInPlace(parameters, data, stream, max_threads_per_block)); - key = nullptr; - value = nullptr; - seqlens_k = reinterpret_cast(data.seqlens_k_total); - } - - void* present_key = reinterpret_cast(const_cast(data.present_key)); - void* present_value = reinterpret_cast(const_cast(data.present_value)); - - DUMP_TENSOR_INIT(); - DUMP_TENSOR("seqlens_k", reinterpret_cast(seqlens_k), batch_size, 1); - - bool past_bsnh = past_kv_format == AttentionQkvFormat::Q_K_V_BSNH; - ORT_RETURN_IF_ERROR(onnxruntime::flash::mha_fwd_kvcache( - device_prop, stream, query, present_key, present_value, key, value, data.output, reinterpret_cast(data.softmax_lse), - seqlens_k, batch_size, num_heads, kv_num_heads, - head_size, sequence_length, present_sequence_length, kv_sequence_length, - scale, is_causal, past_bsnh, parameters.num_splits, reinterpret_cast(data.softmax_lse_accum), - reinterpret_cast(data.out_accum), parameters.local_window_size)); - } else { - // Not share buffer case - // Note that Flash Attention kv-caching 
operates in place on a buffer... therefore this path is inneficient - if (data.past_key != nullptr && data.past_key == data.present_key) { - return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, - "Past and present kv share the same tensor but kv_share_buffer is not on."); - } - - ORT_RETURN_IF_ERROR(LaunchConcatNewToPastKV(parameters, data, stream, max_threads_per_block)); - - if (!parameters.is_prompt) { - ORT_RETURN_IF_ERROR(LaunchGetSeqlenBuff(parameters, data.seqlens_k, data.seqlens_k_total, true, stream, 256)); + if (parameters.is_prompt) { + // set seqlens_k to zeros... flash api uses seqlens_k to indicate where to append key and value + // user should use seqlens_k to index into output to get new tokens + if (batch_size <= parameters.zeros_count) { + seqlens_k = parameters.zero_ptr; + } else { + // Launch kernel to create larger seqlen tensor when batch_size > 256 + constexpr int thr_per_blk = 256; + int blk_in_grid = (batch_size + thr_per_blk - 1) / thr_per_blk; + repeat_seqlen<<>>(data.seqlens_k_total, 0, batch_size); + seqlens_k = data.seqlens_k_total; } - - seqlens_k = reinterpret_cast(data.seqlens_k_total); - - void* present_key = reinterpret_cast(const_cast(data.present_key)); - void* present_value = reinterpret_cast(const_cast(data.present_value)); - - DUMP_TENSOR_INIT(); - DUMP_TENSOR("seqlens_k", reinterpret_cast(seqlens_k), batch_size, 1); - DUMP_TENSOR("Q", data.query, batch_size, sequence_length, num_heads, head_size); - DUMP_TENSOR("K", data.present_key, batch_size, kv_num_heads, present_sequence_length, head_size); - DUMP_TENSOR("V", data.present_value, batch_size, kv_num_heads, present_sequence_length, head_size); - - bool past_bsnh = past_kv_format == AttentionQkvFormat::Q_K_V_BSNH; - ORT_RETURN_IF_ERROR(onnxruntime::flash::mha_fwd_kvcache( - device_prop, stream, query, present_key, present_value, nullptr, nullptr, data.output, reinterpret_cast(data.softmax_lse), - seqlens_k, batch_size, num_heads, kv_num_heads, - head_size, sequence_length, present_sequence_length, 0, - scale, is_causal, past_bsnh, parameters.num_splits, reinterpret_cast(data.softmax_lse_accum), - reinterpret_cast(data.out_accum), parameters.local_window_size)); + } else if (!parameters.kv_share_buffer) { // copy past kv to present kv + ORT_RETURN_IF_ERROR(LaunchConcatNewToPastKV(parameters, data, nullptr, nullptr, stream, max_threads_per_block, + true)); } + void* present_key = reinterpret_cast(const_cast(data.present_key)); + void* present_value = reinterpret_cast(const_cast(data.present_value)); + void* cos_cache = reinterpret_cast(const_cast(data.cos_cache)); + void* sin_cache = reinterpret_cast(const_cast(data.sin_cache)); + + bool past_bsnh = past_kv_format == AttentionQkvFormat::Q_K_V_BSNH; + ORT_RETURN_IF_ERROR(onnxruntime::flash::mha_fwd_kvcache( + device_prop, stream, query, present_key, present_value, key, value, data.output, + reinterpret_cast(data.softmax_lse), seqlens_k, cos_cache, sin_cache, + batch_size, num_heads, kv_num_heads, head_size, sequence_length, + parameters.seqlen_present_kv_cache, kv_sequence_length, parameters.rotary_dim, + scale, is_causal, is_bf16, past_bsnh, parameters.num_splits, reinterpret_cast(data.softmax_lse_accum), + reinterpret_cast(data.out_accum), parameters.local_window_size, parameters.rotary_interleaved, + parameters.is_packed_qkv)); + + // if (parameters.left_padding && parameters.is_prompt) { + // ORT_RETURN_IF_ERROR(LaunchLeftPadLast(parameters, data, stream, device_prop.maxThreadsPerBlock)); + // } + DUMP_TENSOR_INIT(); DUMP_TENSOR("flash attention 
output", data.output, batch_size, sequence_length, num_heads, head_size); @@ -589,15 +646,62 @@ Status EfficientAttention( const int head_size = parameters.head_size; AttentionQkvFormat past_kv_format = parameters.past_kv_format; - const void* query = reinterpret_cast(data.query); - const void* key = reinterpret_cast(data.key); - const void* value = reinterpret_cast(data.value); + const void* query; + const void* key; + const void* value; + + if (!parameters.is_packed_qkv) { + query = reinterpret_cast(data.query); + key = reinterpret_cast(data.key); + value = reinterpret_cast(data.value); + } else { + size_t q_size = static_cast(batch_size * sequence_length * num_heads * head_size); + size_t k_size = static_cast(batch_size * sequence_length * kv_num_heads * head_size); + auto q = reinterpret_cast(data.unpacked_qkv_buffer); + auto k = reinterpret_cast(data.unpacked_qkv_buffer + q_size); + auto v = reinterpret_cast(data.unpacked_qkv_buffer + q_size + k_size); + ORT_RETURN_IF_ERROR(LaunchUnpackQKV(reinterpret_cast(data.query), q, k, v, num_heads, kv_num_heads, + head_size, sequence_length, batch_size, stream, max_threads_per_block)); + query = reinterpret_cast(q); + key = reinterpret_cast(k); + value = reinterpret_cast(v); + } + + if (parameters.do_rotary) { + size_t q_size = static_cast(batch_size * sequence_length * num_heads * head_size); + size_t k_size = static_cast(batch_size * sequence_length * kv_num_heads * head_size); + auto q_buffer = reinterpret_cast(data.rotary_buffer); + auto k_buffer = q_buffer + q_size; + auto position_ids_buff = reinterpret_cast(k_buffer + k_size); + ORT_RETURN_IF_ERROR(LaunchSeqlensToPosIds(parameters, data.seqlens_k, position_ids_buff, stream, + max_threads_per_block)); + DUMP_TENSOR_INIT(); + DUMP_TENSOR("position_ids", position_ids_buff, batch_size, sequence_length); + // Launch rotary embedding kernel + ORT_RETURN_IF_ERROR(LaunchRotaryEmbeddingKernel(stream, q_buffer, reinterpret_cast(query), + position_ids_buff, data.cos_cache, data.sin_cache, + parameters.batch_size, parameters.sequence_length, + parameters.num_heads, parameters.head_size, + parameters.rotary_dim, parameters.seqlen_present_kv_cache, + /*position_ids_format*/ 1, parameters.rotary_interleaved, + device_prop.maxThreadsPerBlock, /*transposed*/ false)); + ORT_RETURN_IF_ERROR(LaunchRotaryEmbeddingKernel(stream, k_buffer, reinterpret_cast(key), + position_ids_buff, data.cos_cache, data.sin_cache, + parameters.batch_size, parameters.sequence_length, + parameters.kv_num_heads, parameters.head_size, + parameters.rotary_dim, parameters.seqlen_present_kv_cache, + /*position_ids_format*/ 1, parameters.rotary_interleaved, + device_prop.maxThreadsPerBlock, /*transposed*/ false)); + query = reinterpret_cast(q_buffer); + key = reinterpret_cast(k_buffer); + } if (parameters.is_prompt) { // Launch kernel to copy seqlen constexpr int thr_per_blk = 256; int blk_in_grid = (batch_size + thr_per_blk - 1) / thr_per_blk; - repeat_seqlen<<>>(data.seqlens_k_total, parameters.sequence_length, batch_size); + repeat_seqlen<<>>(data.seqlens_k_total, parameters.sequence_length, + batch_size); } else { ORT_RETURN_IF_ERROR(LaunchGetSeqlenBuff(parameters, data.seqlens_k, data.seqlens_k_total, true, stream, 256)); } @@ -609,7 +713,7 @@ Status EfficientAttention( "Past and present kv shall share the same tensor when kv_share_buffer is on."); } // Concatenate new kv in place - ORT_RETURN_IF_ERROR(LaunchConcatKVInPlace(parameters, data, stream, max_threads_per_block)); + ORT_RETURN_IF_ERROR(LaunchConcatKVInPlace(parameters, 
data, key, value, stream, max_threads_per_block)); } else { // Not share buffer case if (data.past_key != nullptr && data.past_key == data.present_key) { @@ -617,7 +721,7 @@ Status EfficientAttention( "Past and present kv share the same tensor but kv_share_buffer is not on."); } // Copy past and concat new KV to present buffer - ORT_RETURN_IF_ERROR(LaunchConcatNewToPastKV(parameters, data, stream, max_threads_per_block)); + ORT_RETURN_IF_ERROR(LaunchConcatNewToPastKV(parameters, data, key, value, stream, max_threads_per_block)); } // Ungroup if grouped, otherwise use present kv directly @@ -670,7 +774,6 @@ Status EfficientAttention( p.has_custom_right_padding = true; run_memory_efficient_attention(p); - DUMP_TENSOR_INIT(); DUMP_TENSOR("efficient attention output", data.output, batch_size, sequence_length, num_heads, head_size); return Status::OK(); @@ -682,7 +785,7 @@ Status EfficientAttention( template Status QkvToContext( const cudaDeviceProp& device_prop, - cublasHandle_t& cublas, + cublasHandle_t& /*cublas*/, Stream* ort_stream, contrib::GroupQueryAttentionParameters& parameters, GroupQueryAttentionData& data) { @@ -713,6 +816,15 @@ template Status QkvToContext( contrib::GroupQueryAttentionParameters& parameters, GroupQueryAttentionData& data); +template struct GroupQueryAttentionData; + +template Status QkvToContext( + const cudaDeviceProp& device_prop, + cublasHandle_t& cublas, + Stream* ort_stream, + contrib::GroupQueryAttentionParameters& parameters, + GroupQueryAttentionData& data); + } // namespace cuda } // namespace contrib } // namespace onnxruntime diff --git a/onnxruntime/contrib_ops/cuda/bert/group_query_attention_impl.h b/onnxruntime/contrib_ops/cuda/bert/group_query_attention_impl.h index de32d7ea9316..32341afa0e3f 100644 --- a/onnxruntime/contrib_ops/cuda/bert/group_query_attention_impl.h +++ b/onnxruntime/contrib_ops/cuda/bert/group_query_attention_impl.h @@ -21,6 +21,8 @@ struct GroupQueryAttentionData { const T* past_key = nullptr; const T* past_value = nullptr; int* seqlens_k = nullptr; + const T* cos_cache = nullptr; + const T* sin_cache = nullptr; // Flash buffers T* softmax_lse = nullptr; T* softmax_lse_accum = nullptr; @@ -28,6 +30,8 @@ struct GroupQueryAttentionData { int* seqlens_k_total = nullptr; // Memory Efficient buffers T* fmha_buffer = nullptr; + T* unpacked_qkv_buffer = nullptr; + T* rotary_buffer = nullptr; T* k = nullptr; T* v = nullptr; // Output Tensors diff --git a/onnxruntime/contrib_ops/cuda/bert/longformer_attention.cc b/onnxruntime/contrib_ops/cuda/bert/longformer_attention.cc index e556ae4a490e..9c5d0e9834f6 100644 --- a/onnxruntime/contrib_ops/cuda/bert/longformer_attention.cc +++ b/onnxruntime/contrib_ops/cuda/bert/longformer_attention.cc @@ -136,7 +136,7 @@ Status LongformerAttention::ComputeInternal(OpKernelContext* context) const { cublas, CUBLAS_OP_N, CUBLAS_OP_N, n, m, k, &one, weights_data, n, input_data, k, - &zero, reinterpret_cast(gemm_buffer.get()), n, device_prop)); + &zero, reinterpret_cast(gemm_buffer.get()), n, device_prop, UseTF32())); } else { // q const CudaT* q_weight = weights_data; @@ -145,7 +145,7 @@ Status LongformerAttention::ComputeInternal(OpKernelContext* context) const { cublas, CUBLAS_OP_N, CUBLAS_OP_N, n, m, k, &one, q_weight, n, input_data, k, - &zero, q_data, n, device_prop)); + &zero, q_data, n, device_prop, UseTF32())); // k const CudaT* k_weight = q_weight + static_cast(hidden_size) * hidden_size; CudaT* k_data = q_data + static_cast(batch_size) * sequence_length * hidden_size; @@ -153,7 +153,7 @@ Status 
LongformerAttention::ComputeInternal(OpKernelContext* context) const { cublas, CUBLAS_OP_N, CUBLAS_OP_N, n, m, k, &one, k_weight, n, input_data, k, - &zero, k_data, n, device_prop)); + &zero, k_data, n, device_prop, UseTF32())); // v const CudaT* v_weight = k_weight + static_cast(hidden_size) * hidden_size; @@ -162,7 +162,7 @@ Status LongformerAttention::ComputeInternal(OpKernelContext* context) const { cublas, CUBLAS_OP_N, CUBLAS_OP_N, n, m, k, &one, v_weight, n, input_data, k, - &zero, v_data, n, device_prop)); + &zero, v_data, n, device_prop, UseTF32())); } // Wait for async copy of batch_global_num @@ -195,7 +195,7 @@ Status LongformerAttention::ComputeInternal(OpKernelContext* context) const { cublas, CUBLAS_OP_N, CUBLAS_OP_N, n, m, k, &one, reinterpret_cast(global_weights->Data()), n, input_data, k, - &zero, global_gemm_buffer, n, device_prop)); + &zero, global_gemm_buffer, n, device_prop, UseTF32())); } else { // global q const CudaT* global_q_weight = global_weights_data; @@ -205,7 +205,7 @@ Status LongformerAttention::ComputeInternal(OpKernelContext* context) const { cublas, CUBLAS_OP_N, CUBLAS_OP_N, n, m, k, &one, global_q_weight, n, input_data, k, - &zero, global_q, n, device_prop)); + &zero, global_q, n, device_prop, UseTF32())); } else { CUBLAS_RETURN_IF_ERROR(cublasGemmStridedBatchedHelper( cublas, @@ -226,7 +226,8 @@ Status LongformerAttention::ComputeInternal(OpKernelContext* context) const { hidden_size, // ldc static_cast(max_num_global) * hidden_size, // strideC batch_size, // batch count - device_prop)); + device_prop, + UseTF32())); } // global k const CudaT* global_k_weight = global_weights_data + static_cast(hidden_size) * hidden_size; @@ -235,7 +236,7 @@ Status LongformerAttention::ComputeInternal(OpKernelContext* context) const { cublas, CUBLAS_OP_N, CUBLAS_OP_N, n, m, k, &one, global_k_weight, n, input_data, k, - &zero, global_k, n, device_prop)); + &zero, global_k, n, device_prop, UseTF32())); // global v const CudaT* global_v_weight = global_k_weight + static_cast(hidden_size) * hidden_size; @@ -244,7 +245,7 @@ Status LongformerAttention::ComputeInternal(OpKernelContext* context) const { cublas, CUBLAS_OP_N, CUBLAS_OP_N, n, m, k, &one, global_v_weight, n, input_data, k, - &zero, global_v, n, device_prop)); + &zero, global_v, n, device_prop, UseTF32())); } } diff --git a/onnxruntime/contrib_ops/cuda/bert/longformer_attention_impl.cu b/onnxruntime/contrib_ops/cuda/bert/longformer_attention_impl.cu index f00239460071..c9c66b73b3e9 100644 --- a/onnxruntime/contrib_ops/cuda/bert/longformer_attention_impl.cu +++ b/onnxruntime/contrib_ops/cuda/bert/longformer_attention_impl.cu @@ -1005,7 +1005,6 @@ Status LaunchLongformerAttentionKernel( bool disable_compact_memory, bool use_merged_qkv_weights, bool use_half4) { - CublasMathModeSetter helper(device_prop, cublas, CUBLAS_TENSOR_OP_MATH); size_t softmax_workspace_size = GetLongformerSoftmaxWorkspaceSize(element_size, batch_size, num_heads, diff --git a/onnxruntime/contrib_ops/cuda/bert/multihead_attention.cc b/onnxruntime/contrib_ops/cuda/bert/multihead_attention.cc index ebd66d8c6528..2ef011cdd9a2 100644 --- a/onnxruntime/contrib_ops/cuda/bert/multihead_attention.cc +++ b/onnxruntime/contrib_ops/cuda/bert/multihead_attention.cc @@ -44,6 +44,8 @@ MultiHeadAttention::MultiHeadAttention(const OpKernelInfo& info) mask_filter_value_ = info.GetAttrOrDefault("mask_filter_value", -10000.0f); scale_ = info.GetAttrOrDefault("scale", 0.0f); + is_unidirectional_ = info.GetAttrOrDefault("unidirectional", 0) == 1; + 
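
The trailing UseTF32() argument added to these cublasGemmHelper calls (together with the global CublasMathModeSetter deleted from longformer_attention_impl.cu) moves the TF32 decision to each call site. As a rough sketch of the underlying cuBLAS mechanism, assuming cuBLAS 11+ and plain SGEMM rather than ORT's internal helper signature:

#include <cublas_v2.h>

// Minimal sketch of what a per-call use_tf32 flag typically selects.
// TF32 trades a little FP32 mantissa precision for tensor-core throughput.
void GemmWithOptionalTf32(cublasHandle_t handle, bool use_tf32,
                          int m, int n, int k,
                          const float* A, const float* B, float* C) {
  const float one = 1.0f, zero = 0.0f;
  cublasSetMathMode(handle, use_tf32 ? CUBLAS_TF32_TENSOR_OP_MATH
                                     : CUBLAS_DEFAULT_MATH);
  cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k,
              &one, A, m, B, k, &zero, C, m);
  cublasSetMathMode(handle, CUBLAS_DEFAULT_MATH);  // restore the default mode
}

Scoping the mode per call, rather than setting it once per handle, keeps kernels that need full FP32 accuracy unaffected by kernels that opt in to TF32.
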
ORT_ENFORCE(!is_unidirectional_, "Unidirectional MHA does not support CUDA kernel. Consider using Attention or GQA instead."); disable_fused_self_attention_ = sizeof(T) != 2 || ParseEnvironmentVariableWithDefault(attention::kDisableFusedSelfAttention, false); @@ -92,6 +94,8 @@ Status MultiHeadAttention::ComputeInternal(OpKernelContext* context) const { auto& device_prop = GetDeviceProp(); AttentionParameters parameters; + parameters.use_tf32 = UseTF32(); + ORT_RETURN_IF_ERROR(multihead_attention_helper::CheckInputs(query, key, value, @@ -105,6 +109,7 @@ Status MultiHeadAttention::ComputeInternal(OpKernelContext* context) const { num_heads_, mask_filter_value_, scale_, + is_unidirectional_, false, // past_present_share_buffer false, // dmmha_packing device_prop.maxThreadsPerBlock)); diff --git a/onnxruntime/contrib_ops/cuda/bert/multihead_attention.h b/onnxruntime/contrib_ops/cuda/bert/multihead_attention.h index c162f7133cc1..86a32c92ce00 100644 --- a/onnxruntime/contrib_ops/cuda/bert/multihead_attention.h +++ b/onnxruntime/contrib_ops/cuda/bert/multihead_attention.h @@ -25,6 +25,7 @@ class MultiHeadAttention final : public CudaKernel { int num_heads_; // number of attention heads float mask_filter_value_; float scale_; + bool is_unidirectional_; bool disable_fused_self_attention_; bool enable_trt_flash_attention_; bool disable_fused_cross_attention_; diff --git a/onnxruntime/contrib_ops/cuda/bert/packed_attention.cc b/onnxruntime/contrib_ops/cuda/bert/packed_attention.cc index ec8b1d051b3d..e4b90727121c 100644 --- a/onnxruntime/contrib_ops/cuda/bert/packed_attention.cc +++ b/onnxruntime/contrib_ops/cuda/bert/packed_attention.cc @@ -268,6 +268,7 @@ Status PackedAttention::ComputeInternal(OpKernelContext* context) const { const Tensor* relative_position_bias = context->Input(5); PackedAttentionParameters parameters; + parameters.use_tf32 = UseTF32(); ORT_RETURN_IF_ERROR(CheckInputs(input->Shape(), weights->Shape(), bias->Shape(), @@ -303,17 +304,17 @@ Status PackedAttention::ComputeInternal(OpKernelContext* context) const { int m = parameters.token_count; int n = parameters.hidden_size + parameters.hidden_size + parameters.v_hidden_size; int k = parameters.input_hidden_size; - gemm_buffer = this->GetScratchBuffer(static_cast(m) * n, context->GetComputeStream()); + gemm_buffer = this->template GetScratchBuffer(static_cast(m) * n, context->GetComputeStream()); cublasHandle_t cublas = this->GetCublasHandle(context); // Gemm, note that CUDA assumes col-major, so result(N, M) = 1 * weights x input + 1 x bias - // The bias part is not included here since we fuse bias, transpose and output 3 matrice into one cuda kernel. + // The bias part is not included here since we fuse bias, transpose and output 3 matrices into one cuda kernel. 
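
The col-major note above uses the standard trick for driving a column-major BLAS from row-major buffers: a row-major matrix read as column-major is its transpose, so computing B^T * A^T with the (n, m, k) ordering leaves (A * B)^T in memory, which is exactly row-major A * B. A self-contained illustration of the same call shape (hypothetical wrapper, FP32 for simplicity):

#include <cublas_v2.h>

// Row-major C(MxN) = A(MxK) * B(KxN) via column-major cuBLAS:
// interpret the row-major buffers as column-major B^T (NxK) and A^T (KxM),
// compute B^T * A^T = (A * B)^T, which is row-major C in memory.
void RowMajorGemm(cublasHandle_t handle, int M, int N, int K,
                  const float* A, const float* B, float* C) {
  const float one = 1.0f, zero = 0.0f;
  cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N,
              N, M, K,        // (n, m, k), matching the call below
              &one, B, N,     // "B^T" with leading dimension N
              A, K,           // "A^T" with leading dimension K
              &zero, C, N);   // row-major C, leading dimension N
}
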
CUBLAS_RETURN_IF_ERROR(cublasGemmHelper( cublas, CUBLAS_OP_N, CUBLAS_OP_N, n, m, k, &one, reinterpret_cast(weights->Data()), n, reinterpret_cast(input->Data()), k, - &zero, reinterpret_cast(gemm_buffer.get()), n, device_prop)); + &zero, reinterpret_cast(gemm_buffer.get()), n, device_prop, UseTF32())); constexpr size_t element_size = sizeof(T); constexpr bool no_qkv_workspace = false; // need workspace to add bias @@ -327,7 +328,7 @@ Status PackedAttention::ComputeInternal(OpKernelContext* context) const { false, use_memory_efficient_attention, no_qkv_workspace); - auto work_space = this->GetScratchBuffer(workSpaceSize, context->GetComputeStream()); + auto work_space = this->template GetScratchBuffer(workSpaceSize, context->GetComputeStream()); typedef typename ToCudaType::MappedType CudaT; PackedAttentionData data; diff --git a/onnxruntime/contrib_ops/cuda/bert/packed_attention_impl.cu b/onnxruntime/contrib_ops/cuda/bert/packed_attention_impl.cu index 3b5232083940..a84a310b46ca 100644 --- a/onnxruntime/contrib_ops/cuda/bert/packed_attention_impl.cu +++ b/onnxruntime/contrib_ops/cuda/bert/packed_attention_impl.cu @@ -440,7 +440,7 @@ Status LaunchTransposeRemovePadding( template Status FusedScaledDotProductAttention( - const cudaDeviceProp& device_prop, + const cudaDeviceProp& /*device_prop*/, cudaStream_t stream, PackedAttentionParameters& parameters, PackedAttentionData& data) { @@ -596,7 +596,7 @@ Status UnfusedScaledDotProductAttention( q, qk_head_size, sequence_length * qk_head_size, &zero, scaled_qk, sequence_length, sequence_length * sequence_length, - batches, device_prop)); + batches, device_prop, parameters.use_tf32)); DUMP_TENSOR_D("PackedAttention unfused QK", scaled_qk, batch_size * num_heads, sequence_length, sequence_length); @@ -624,7 +624,7 @@ Status UnfusedScaledDotProductAttention( v_head_size, sequence_length, sequence_length, &one, v, v_head_size, sequence_length * v_head_size, attention_score, sequence_length, sequence_length * sequence_length, - &zero, temp_output, v_head_size, sequence_length * v_head_size, batches, device_prop)); + &zero, temp_output, v_head_size, sequence_length * v_head_size, batches, device_prop, parameters.use_tf32)); // Temp_output is BxNxSxH_v, transpose and remove padding to output token_countxNxH_v Status result = LaunchTransposeRemovePadding( diff --git a/onnxruntime/contrib_ops/cuda/bert/packed_multihead_attention.cc b/onnxruntime/contrib_ops/cuda/bert/packed_multihead_attention.cc index 1b026e64778e..00ab32886112 100644 --- a/onnxruntime/contrib_ops/cuda/bert/packed_multihead_attention.cc +++ b/onnxruntime/contrib_ops/cuda/bert/packed_multihead_attention.cc @@ -228,6 +228,7 @@ Status PackedMultiHeadAttention::ComputeInternal(OpKernelContext* context) co const Tensor* relative_position_bias = context->Input(6); PackedAttentionParameters parameters; + parameters.use_tf32 = UseTF32(); ORT_RETURN_IF_ERROR(CheckInputs(query->Shape(), key, value, @@ -297,7 +298,7 @@ Status PackedMultiHeadAttention::ComputeInternal(OpKernelContext* context) co use_flash_attention, use_memory_efficient_attention, no_qkv_workspace); - auto work_space = this->GetScratchBuffer(workSpaceSize, context->GetComputeStream()); + auto work_space = this->template GetScratchBuffer(workSpaceSize, context->GetComputeStream()); typedef typename ToCudaType::MappedType CudaT; PackedMultiHeadAttentionData data; diff --git a/onnxruntime/contrib_ops/cuda/bert/packed_multihead_attention_impl.cu b/onnxruntime/contrib_ops/cuda/bert/packed_multihead_attention_impl.cu index 
8a508241d80b..982c7eaa2cb2 100644 --- a/onnxruntime/contrib_ops/cuda/bert/packed_multihead_attention_impl.cu +++ b/onnxruntime/contrib_ops/cuda/bert/packed_multihead_attention_impl.cu @@ -381,7 +381,7 @@ void InvokeTranspose( const T* query, const T* key, const T* value, const T* bias, T* output, const int batch_size, const int sequence_length, const int num_heads, const int qk_head_size, const int v_head_size, - AttentionQkvFormat source_format, AttentionQkvFormat target_format, + [[maybe_unused]] AttentionQkvFormat source_format, AttentionQkvFormat target_format, const int32_t* token_offset, int32_t token_count, cudaStream_t stream) { if (key != nullptr && value != nullptr) { @@ -551,7 +551,7 @@ void LaunchTranspose( template Status FusedAttentionTrt( - const cudaDeviceProp& device_prop, + const cudaDeviceProp& /*device_prop*/, cudaStream_t stream, PackedAttentionParameters& parameters, PackedMultiHeadAttentionData& data) { @@ -639,7 +639,8 @@ Status FlashAttention( sequence_length, sequence_length, scale, - false // is causal + false, // is causal + false // is bf16 )); DUMP_TENSOR_INIT(); @@ -774,7 +775,7 @@ Status UnfusedAttention( q, qk_head_size, sequence_length * qk_head_size, &zero, scaled_qk, sequence_length, sequence_length * sequence_length, - batches, device_prop)); + batches, device_prop, parameters.use_tf32)); // Q, K and V are ready now DUMP_TENSOR_INIT(); @@ -807,7 +808,7 @@ Status UnfusedAttention( v_head_size, sequence_length, sequence_length, &one, v, v_head_size, sequence_length * v_head_size, attention_score, sequence_length, sequence_length * sequence_length, - &zero, temp_output, v_head_size, sequence_length * v_head_size, batches, device_prop)); + &zero, temp_output, v_head_size, sequence_length * v_head_size, batches, device_prop, parameters.use_tf32)); // Temp_output is BxNxSxH_v, transpose and remove padding to output TxNxH_v Status result = LaunchTransposeRemovePadding( diff --git a/onnxruntime/contrib_ops/cuda/bert/relative_attn_bias.cc b/onnxruntime/contrib_ops/cuda/bert/relative_attn_bias.cc index 92ba808dd85c..05f55d9106d0 100644 --- a/onnxruntime/contrib_ops/cuda/bert/relative_attn_bias.cc +++ b/onnxruntime/contrib_ops/cuda/bert/relative_attn_bias.cc @@ -200,7 +200,7 @@ Status GatedRelativePositionBias::ComputeInternal(OpKernelContext* context) c D, BNS, head_size, &one, reinterpret_cast(weight_tensor.template Data()), (int)D, reinterpret_cast(workspace.get()), (int)head_size, - &zero, gemm_output, ld_gemm_output, device_prop)); + &zero, gemm_output, ld_gemm_output, device_prop, UseTF32())); auto status = LaunchGatedRelativePositionBiasKernel( device_prop, stream, diff --git a/onnxruntime/contrib_ops/cuda/bert/rotary_embedding.cc b/onnxruntime/contrib_ops/cuda/bert/rotary_embedding.cc index 2d12e975d88d..ab7479f2938f 100644 --- a/onnxruntime/contrib_ops/cuda/bert/rotary_embedding.cc +++ b/onnxruntime/contrib_ops/cuda/bert/rotary_embedding.cc @@ -29,10 +29,13 @@ namespace cuda { REGISTER_KERNEL_TYPED(float) REGISTER_KERNEL_TYPED(MLFloat16) +REGISTER_KERNEL_TYPED(BFloat16) template RotaryEmbedding::RotaryEmbedding(const OpKernelInfo& info) : CudaKernel(info) { scale = info.GetAttrOrDefault("scale", 1.0); + rotary_embedding_dim = static_cast(info.GetAttrOrDefault("rotary_embedding_dim", 0)); + num_heads = static_cast(info.GetAttrOrDefault("num_heads", 0)); interleaved = (info.GetAttrOrDefault("interleaved", 0) == 1); } @@ -48,6 +51,8 @@ Status RotaryEmbedding::ComputeInternal(OpKernelContext* context) const { position_ids, cos_cache, sin_cache, + num_heads, + 
rotary_embedding_dim, ¶meters)); Tensor* output = context->Output(0, input->Shape()); @@ -71,13 +76,12 @@ Status RotaryEmbedding::ComputeInternal(OpKernelContext* context) const { parameters.sequence_length, parameters.num_heads, parameters.head_size, + parameters.rotary_embedding_dim, parameters.max_sequence_length, parameters.position_ids_format, interleaved, device_prop.maxThreadsPerBlock, parameters.transposed); - - return Status::OK(); } } // namespace cuda diff --git a/onnxruntime/contrib_ops/cuda/bert/rotary_embedding.h b/onnxruntime/contrib_ops/cuda/bert/rotary_embedding.h index 6dab2ad56749..d52f61d67044 100644 --- a/onnxruntime/contrib_ops/cuda/bert/rotary_embedding.h +++ b/onnxruntime/contrib_ops/cuda/bert/rotary_embedding.h @@ -19,6 +19,8 @@ class RotaryEmbedding final : public CudaKernel { protected: float scale; + int num_heads; + int rotary_embedding_dim; bool interleaved; }; diff --git a/onnxruntime/contrib_ops/cuda/bert/rotary_embedding_impl.cu b/onnxruntime/contrib_ops/cuda/bert/rotary_embedding_impl.cu index e1b83bd8caf5..bd50e8646c4c 100644 --- a/onnxruntime/contrib_ops/cuda/bert/rotary_embedding_impl.cu +++ b/onnxruntime/contrib_ops/cuda/bert/rotary_embedding_impl.cu @@ -7,9 +7,9 @@ Licensed under the MIT License. Kernel implementation for rotary embeddings. */ -#include -#include "core/providers/cuda/cu_inc/common.cuh" #include "contrib_ops/cuda/bert/rotary_embedding_impl.h" +#include "core/providers/cuda/cu_inc/common.cuh" +#include using namespace onnxruntime::cuda; @@ -18,141 +18,120 @@ namespace contrib { namespace cuda { template -__global__ void RotaryEmbeddingBSNH(T* output, // BxSxNxH - const T* input, // BxSxNxH - const T* cos_cache, // Mx(H/2) - const T* sin_cache, // Mx(H/2) - const int64_t* position_ids, // (1) or BxS - const int sequence_length, - const int num_heads, - const int head_size, - const int position_ids_format, - const bool interleaved, - const int batch_stride, - const int seq_stride, +__global__ void RotaryEmbeddingBSNH(T *output, // BxSxNxH + const T *input, // BxSxNxH + const T *cos_cache, // Mx(H/2) + const T *sin_cache, // Mx(H/2) + const int64_t *position_ids, // (1) or BxS + const int sequence_length, const int num_heads, const int head_size, + const int rotary_embedding_dim, const int position_ids_format, + const bool interleaved, const int batch_stride, const int seq_stride, const int head_stride) { - // B = batch size, S = sequence length, N = num heads, H = head size, M = max sequence length - // Use .x in innermost loop to access global memory efficiently - - const int b = blockIdx.z; - const int s = blockIdx.y; - const int n = blockIdx.x; - - const int i = threadIdx.x; - - const int block_offset = b * batch_stride + s * seq_stride + n * head_stride; - - const T* input_data = input + block_offset; - T* output_data = output + block_offset; - - // Cache is (M, H/2) - const int half_head_size = head_size / 2; - const int position_id = (position_ids_format == 0) ? \ - static_cast(position_ids[0]) + s \ - : static_cast(position_ids[b * sequence_length + s]); - const int cache_offset = position_id * half_head_size; - const T* cos_data = cos_cache + cache_offset; - const T* sin_data = sin_cache + cache_offset; - - int cache_idx = 0; - T sign = 0; - int j = 0; - if (interleaved) { - cache_idx = (i / 2) % half_head_size; - sign = (i % 2 == 0) ? -1 : 1; - j = (i % 2 == 0) ? i+1 : i-1; // i - sign - } else { - cache_idx = i % half_head_size; - sign = (i < half_head_size) ? 
-1 : 1; - j = (i + half_head_size) % head_size; - } - output_data[i] = input_data[i] * cos_data[cache_idx] + sign * input_data[j] * sin_data[cache_idx]; + // B = batch size, S = sequence length, N = num heads, H = head size, M = max sequence length + // Use .x in innermost loop to access global memory efficiently + + const int b = blockIdx.y; + const int s = blockIdx.x; + const int n = blockIdx.z; + + const int i = threadIdx.x; + + if (i >= head_size) { + return; + } + + const int block_offset = b * batch_stride + s * seq_stride + n * head_stride; + + const T *input_data = input + block_offset; + T *output_data = output + block_offset; + + if (i >= rotary_embedding_dim) { + output_data[i] = input_data[i]; + return; + } + + // Cache is (M, H/2) + const int half_rotary_embedding_dim = rotary_embedding_dim / 2; + const int position_id = (position_ids_format == 0) ? static_cast(position_ids[0]) + s + : static_cast(position_ids[b * sequence_length + s]); + const int cache_offset = position_id * half_rotary_embedding_dim; + const T *cos_data = cos_cache + cache_offset; + const T *sin_data = sin_cache + cache_offset; + + int cache_idx = 0; + T sign = 0; + int j = 0; + if (interleaved) { + cache_idx = (i / 2) % half_rotary_embedding_dim; + sign = (i % 2 == 0) ? -1 : 1; + j = (i % 2 == 0) ? i + 1 : i - 1; // i - sign + } else { + cache_idx = i % half_rotary_embedding_dim; + sign = (i < half_rotary_embedding_dim) ? -1 : 1; + j = (i + half_rotary_embedding_dim) % rotary_embedding_dim; + } + output_data[i] = input_data[i] * cos_data[cache_idx] + sign * input_data[j] * sin_data[cache_idx]; } - template -Status LaunchRotaryEmbeddingKernel( - cudaStream_t stream, - T* output, - const T* input, - const int64_t* position_ids, - const T* cos_cache, - const T* sin_cache, - const int batch_size, - const int sequence_length, - const int num_heads, - const int head_size, - const int max_sequence_length, - const int position_ids_format, - const bool interleaved, - const int max_threads_per_block, - const bool transposed) { - - constexpr int smem_size = 0; - const dim3 grid(num_heads, sequence_length, batch_size); - const dim3 block(head_size, 1, 1); - - // Note: Current implementation assumes head_size <= max_threads_per_block - // because head_size is currently large for LLaMA-2. For smaller head_size - // and num_heads values, we can create a block as `block(num_heads, head_size, 1)` - // instead. This will require kernel changes to support. 
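
The rewritten RotaryEmbeddingBSNH above generalizes the rotation to a rotary_embedding_dim that may be smaller than head_size: lanes at or past rotary_embedding_dim copy through untouched, and the cache index math otherwise matches the old kernel with head_size replaced by rotary_embedding_dim. A scalar CPU reference of the same per-element update (hypothetical helper; one head at one position, with cos_row/sin_row already pointing at the cache row for that position id):

// CPU reference for the per-head rotation in RotaryEmbeddingBSNH (illustrative).
// cos_row and sin_row each have rotary_dim / 2 entries.
void RotateHeadReference(const float* input, float* output, int head_size,
                         int rotary_dim, bool interleaved,
                         const float* cos_row, const float* sin_row) {
  const int half = rotary_dim / 2;
  for (int i = 0; i < head_size; ++i) {
    if (i >= rotary_dim) {        // tail of the head passes through unrotated
      output[i] = input[i];
      continue;
    }
    int cache_idx, j;
    float sign;
    if (interleaved) {            // rotated pairs are (0,1), (2,3), ...
      cache_idx = (i / 2) % half;
      sign = (i % 2 == 0) ? -1.0f : 1.0f;
      j = (i % 2 == 0) ? i + 1 : i - 1;
    } else {                      // rotated pairs are (i, i + half)
      cache_idx = i % half;
      sign = (i < half) ? -1.0f : 1.0f;
      j = (i + half) % rotary_dim;
    }
    output[i] = input[i] * cos_row[cache_idx] + sign * input[j] * sin_row[cache_idx];
  }
}

The launch change is mechanical by comparison: the grid moves to (sequence_length, batch_size, num_heads), the block is head_size rounded up to a warp multiple, and threads beyond head_size exit early.
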
- - // Default input tensor shape is [batch, seq, hidden_size] - int head_stride = head_size; - int seq_stride = num_heads * head_stride; - int batch_stride = sequence_length * seq_stride; - if (transposed) { - // When transposed, input tensor shape is [batch, num_heads, seq, head_size] - seq_stride = head_size; - head_stride = sequence_length * seq_stride; - batch_stride = num_heads * head_stride; - } - - assert(head_size <= max_threads_per_block); - RotaryEmbeddingBSNH<<>>( - output, input, cos_cache, sin_cache, position_ids, - sequence_length, num_heads, head_size, position_ids_format, interleaved, - batch_stride, seq_stride, head_stride - ); - - return CUDA_CALL(cudaGetLastError()); +Status LaunchRotaryEmbeddingKernel(cudaStream_t stream, T *output, const T *input, const int64_t *position_ids, + const T *cos_cache, const T *sin_cache, const int batch_size, + const int sequence_length, const int num_heads, const int head_size, + const int rotary_embedding_dim, const int /*max_sequence_length*/, + const int position_ids_format, const bool interleaved, + const int max_threads_per_block, const bool transposed) { + // Note: Current implementation assumes head_size <= max_threads_per_block + // because head_size is currently large for LLaMA-2. For smaller head_size + // and num_heads values, we can create a block as `block(num_heads, head_size, 1)` + // instead. This will require kernel changes to support. + ORT_ENFORCE(head_size <= max_threads_per_block, "Rotary embedding dim must be <= max_threads_per_block"); + + int tpb = (head_size + 31) / 32 * 32; + + const dim3 block(tpb); + const dim3 grid(sequence_length, batch_size, num_heads); + + // Default input tensor shape is [batch, seq, hidden_size] + int head_stride = head_size; + int seq_stride = num_heads * head_stride; + int batch_stride = sequence_length * seq_stride; + if (transposed) { + // When transposed, input tensor shape is [batch, num_heads, seq, head_size] + seq_stride = head_size; + head_stride = sequence_length * seq_stride; + batch_stride = num_heads * head_stride; + } + + assert(head_size <= max_threads_per_block); + RotaryEmbeddingBSNH<<>>(output, input, cos_cache, sin_cache, position_ids, sequence_length, + num_heads, head_size, rotary_embedding_dim, position_ids_format, + interleaved, batch_stride, seq_stride, head_stride); + + return CUDA_CALL(cudaGetLastError()); } -template Status LaunchRotaryEmbeddingKernel( - cudaStream_t stream, - float* output, - const float* input, - const int64_t* position_ids, - const float* cos_cache, - const float* sin_cache, - const int batch_size, - const int sequence_length, - const int num_heads, - const int head_size, - const int max_sequence_length, - const int position_ids_format, - const bool interleaved, - const int max_threads_per_block, - const bool transposed); - -template Status LaunchRotaryEmbeddingKernel( - cudaStream_t stream, - half* output, - const half* input, - const int64_t* position_ids, - const half* cos_cache, - const half* sin_cache, - const int batch_size, - const int sequence_length, - const int num_heads, - const int head_size, - const int max_sequence_length, - const int position_ids_format, - const bool interleaved, - const int max_threads_per_block, - const bool transposed); - - -} // namespace cuda -} // namespace contrib -} // namespace onnxruntime +template Status LaunchRotaryEmbeddingKernel(cudaStream_t stream, float *output, const float *input, + const int64_t *position_ids, const float *cos_cache, + const float *sin_cache, const int batch_size, + const int 
sequence_length, const int num_heads, const int head_size, + const int rotary_embedding_dim, const int max_sequence_length, + const int position_ids_format, const bool interleaved, + const int max_threads_per_block, const bool transposed); + +template Status LaunchRotaryEmbeddingKernel(cudaStream_t stream, half *output, const half *input, + const int64_t *position_ids, const half *cos_cache, + const half *sin_cache, const int batch_size, + const int sequence_length, const int num_heads, const int head_size, + const int rotary_embedding_dim, const int max_sequence_length, + const int position_ids_format, const bool interleaved, + const int max_threads_per_block, const bool transposed); + +template Status LaunchRotaryEmbeddingKernel( + cudaStream_t stream, BFloat16 *output, const BFloat16 *input, const int64_t *position_ids, + const BFloat16 *cos_cache, const BFloat16 *sin_cache, const int batch_size, const int sequence_length, + const int num_heads, const int head_size, const int rotary_embedding_dim, const int max_sequence_length, + const int position_ids_format, const bool interleaved, const int max_threads_per_block, const bool transposed); + +} // namespace cuda +} // namespace contrib +} // namespace onnxruntime diff --git a/onnxruntime/contrib_ops/cuda/bert/rotary_embedding_impl.h b/onnxruntime/contrib_ops/cuda/bert/rotary_embedding_impl.h index ee1ccc43dcbf..36300fe7a660 100644 --- a/onnxruntime/contrib_ops/cuda/bert/rotary_embedding_impl.h +++ b/onnxruntime/contrib_ops/cuda/bert/rotary_embedding_impl.h @@ -21,6 +21,7 @@ Status LaunchRotaryEmbeddingKernel( const int sequence_length, const int num_heads, const int head_size, + const int rotary_embedding_dim, const int max_sequence_length, const int position_ids_format, const bool interleaved, diff --git a/onnxruntime/contrib_ops/cuda/bert/tensorrt_fused_multihead_attention/mha_runner.cu b/onnxruntime/contrib_ops/cuda/bert/tensorrt_fused_multihead_attention/mha_runner.cu index 8fb6575d27cc..4a4e3eeecf64 100644 --- a/onnxruntime/contrib_ops/cuda/bert/tensorrt_fused_multihead_attention/mha_runner.cu +++ b/onnxruntime/contrib_ops/cuda/bert/tensorrt_fused_multihead_attention/mha_runner.cu @@ -53,9 +53,9 @@ class FusedMHARunnerFP16v2::mhaImpl { ~mhaImpl() {} - void setup(const int S, const int B) { + void setup(const int seq_len, const int B) { // For bert and vit, use flash attention when sequence length is larger than the threshold. - use_flash_attention = is_flash_attention(S); + use_flash_attention = is_flash_attention(seq_len); params.force_unroll = use_flash_attention; @@ -68,26 +68,26 @@ class FusedMHARunnerFP16v2::mhaImpl { warps_n = 1; } else { if (sm == 70) { - if (S == 64 || S == 96) { + if (seq_len == 64 || seq_len == 96) { warps_m = 2; warps_n = 2; - } else if (S == 128) { + } else if (seq_len == 128) { warps_m = 1; warps_n = 4; - } else if (S == 256 || S == 384) { + } else if (seq_len == 256 || seq_len == 384) { warps_m = 1; warps_n = 8; } else { ORT_ENFORCE(false, "Unsupported sequence length"); } } else { - if (S == 32 || S == 64 || S == 96 || S == 128) { + if (seq_len == 32 || seq_len == 64 || seq_len == 96 || seq_len == 128) { warps_m = 2; warps_n = 2; - } else if (S == 192 || S == 256) { + } else if (seq_len == 192 || seq_len == 256) { warps_m = 1; warps_n = 4; - } else if (S == 384) { + } else if (seq_len == 384) { warps_m = 1; warps_n = 8; } else { @@ -99,7 +99,7 @@ class FusedMHARunnerFP16v2::mhaImpl { // The number of threads per CTA. 
threads_per_cta = warps_m * warps_n * warps_k * 32; // The number of xmmas in the M dimension. We use one uint32_t per XMMA in the M dimension. - xmmas_m = (S + 16 * warps_m - 1) / (16 * warps_m); + xmmas_m = (seq_len + 16 * warps_m - 1) / (16 * warps_m); const float scale_bmm1 = interface->mScale; const float scale_softmax = 1.f; // Seems to be only required for int8 @@ -111,7 +111,7 @@ class FusedMHARunnerFP16v2::mhaImpl { params.b = B; params.h = interface->mNumHeads; - params.s = S; + params.s = seq_len; params.d = interface->mHeadSize; params.qkv_stride_in_bytes = 3 * interface->mNumHeads * interface->mHeadSize * sizeof(half); @@ -121,7 +121,7 @@ class FusedMHARunnerFP16v2::mhaImpl { has_causal_mask = false; } - void setup_causal_masked_fmha(const int S, const int B) { + void setup_causal_masked_fmha(const int seq_len, const int B) { const float scale_bmm1 = interface->mScale; const float scale_softmax = 1.f; // Seems to be only required for int8 const float scale_bmm2 = 1.f; @@ -132,7 +132,7 @@ class FusedMHARunnerFP16v2::mhaImpl { params.b = B; params.h = interface->mNumHeads; - params.s = S; + params.s = seq_len; params.d = interface->mHeadSize; params.qkv_stride_in_bytes = 3 * interface->mNumHeads * interface->mHeadSize * sizeof(half); @@ -182,30 +182,30 @@ class FusedMHARunnerFP16v2::mhaImpl { return max_seq_len; } - int S = max_seq_len; + int seq_len = max_seq_len; if (max_seq_len <= 32) { - S = (sm == 70) ? 64 : 32; + seq_len = (sm == 70) ? 64 : 32; } else if (max_seq_len <= 64) { - S = 64; + seq_len = 64; } else if (max_seq_len <= 96) { - S = 96; + seq_len = 96; } else if (max_seq_len <= 128) { - S = 128; + seq_len = 128; } else if (max_seq_len <= 192) { - S = (sm == 70) ? 256 : 192; + seq_len = (sm == 70) ? 256 : 192; } else if (max_seq_len <= 256) { - S = 256; + seq_len = 256; } else if (max_seq_len <= 384) { - S = 384; + seq_len = 384; } - return S; + return seq_len; } protected: - bool is_flash_attention(const int S) const { + bool is_flash_attention(const int seq_len) const { ORT_ENFORCE(interface->mHasCausalMask == false); - return interface->mEnableFlashAttention && S >= kMinSequenceLengthFlashAttention; + return interface->mEnableFlashAttention && seq_len >= kMinSequenceLengthFlashAttention; } private: @@ -232,12 +232,12 @@ FusedMHARunnerFP16v2::FusedMHARunnerFP16v2(const int numHeads, pimpl(new mhaImpl(this)) { } -void FusedMHARunnerFP16v2::setup(const int S, const int B) { - MHARunner::setup(S, B); +void FusedMHARunnerFP16v2::setup(const int seq_len, const int B) { + MHARunner::setup(seq_len, B); if (mHasCausalMask) { - pimpl->setup_causal_masked_fmha(S, B); + pimpl->setup_causal_masked_fmha(seq_len, B); } else { - pimpl->setup(S, B); + pimpl->setup(seq_len, B); } } diff --git a/onnxruntime/contrib_ops/cuda/collective/sharded_moe.cc b/onnxruntime/contrib_ops/cuda/collective/sharded_moe.cc index 9b989dac9a94..1dbbe8c4e7ea 100644 --- a/onnxruntime/contrib_ops/cuda/collective/sharded_moe.cc +++ b/onnxruntime/contrib_ops/cuda/collective/sharded_moe.cc @@ -1,7 +1,7 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. 
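
For the ShardedMoE hunks that follow: the collective step is now chosen by parallel type. Tensor parallelism sums per-shard partial FC2 outputs with an all-reduce, expert parallelism broadcasts each rank's contiguous row range, and combined EP+TP is rejected as unsupported. A hedged sketch of that dispatch, with illustrative enum and buffer names standing in for the MoEParallelType plumbing below:

#include <nccl.h>

enum class ParallelType { None, TP, EP, EPAndTP };

// Illustrative collective step for sharded MoE output reassembly.
ncclResult_t MoeCollective(ParallelType type, ncclComm_t comm, cudaStream_t stream,
                           const float* fc2_out, float* fc2_out_gathered,
                           size_t total_count,            // elements across all rows
                           const size_t* rank_row_offset, // per-rank first element
                           const size_t* rank_row_count,  // per-rank element count
                           int world_size) {
  if (type == ParallelType::TP) {
    // Every rank holds a partial sum over its weight shard: reduce them all.
    return ncclAllReduce(fc2_out, fc2_out_gathered, total_count,
                         ncclFloat, ncclSum, comm, stream);
  }
  if (type == ParallelType::EP) {
    // Each rank owns the rows routed to its local experts: broadcast them.
    ncclGroupStart();
    for (int rank = 0; rank < world_size; ++rank) {
      ncclBroadcast(fc2_out + rank_row_offset[rank],
                    fc2_out_gathered + rank_row_offset[rank],
                    rank_row_count[rank], ncclFloat, rank, comm, stream);
    }
    return ncclGroupEnd();
  }
  return ncclInvalidArgument;  // EPAndTP is not supported in this PR
}

In the real kernel the per-rank offsets come from rank_to_experts_start_index_ plus the runner's get_total_rows_info, which is why the EP path still synchronizes on the copy_event before issuing the broadcasts.
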
-#ifdef USE_CUTLASS +#include #include "core/common/safeint.h" #include "core/providers/cuda/cuda_common.h" @@ -18,25 +18,18 @@ namespace cuda { #if defined(ORT_USE_NCCL) -#define REGISTER_KERNEL_TYPED(T) \ - ONNX_OPERATOR_TYPED_KERNEL_EX( \ - ShardedMoE, \ - kMSDomain, \ - 1, \ - T, \ - kCudaExecutionProvider, \ - (*KernelDefBuilder::Create()) \ - .MayInplace(0, 0) \ - .TypeConstraint("T", DataTypeImpl::GetTensorType()), \ +#define REGISTER_KERNEL_TYPED(T) \ + ONNX_OPERATOR_TYPED_KERNEL_EX( \ + ShardedMoE, kMSDomain, 1, T, kCudaExecutionProvider, \ + (*KernelDefBuilder::Create()).MayInplace(0, 0).TypeConstraint("T", DataTypeImpl::GetTensorType()), \ ShardedMoE); REGISTER_KERNEL_TYPED(float) REGISTER_KERNEL_TYPED(MLFloat16) -using namespace ONNX_NAMESPACE; - template ShardedMoE::ShardedMoE(const OpKernelInfo& op_kernel_info) : NcclKernel(op_kernel_info), MoEBase(op_kernel_info) { + ORT_ENFORCE(op_kernel_info.GetAttr("tensor_shards", &tensor_shards_).IsOK()); ORT_ENFORCE(op_kernel_info.GetAttr("local_experts_start_index", &local_experts_start_index_).IsOK()); rank_to_experts_start_index_.resize(nccl_->Size()); // Initialize rank_to_experts_start_index_[0] to a value to convey that it is not initialized. @@ -57,27 +50,34 @@ Status ShardedMoE::ComputeInternal(OpKernelContext* context) const { // Create a {Rank, ExpertsStartIndex} map on Host. AutoDestoryCudaEvent cuda_event; cudaEvent_t& copy_event = cuda_event.Get(); - ORT_RETURN_IF_ERROR(SynchronizeExpertsStartIndex(allocator, context, copy_event)); const Tensor* input = context->Input(0); const Tensor* router_probs = context->Input(1); const Tensor* fc1_experts_weights = context->Input(2); - const Tensor* fc2_experts_weights = context->Input(3); - const Tensor* fc1_experts_bias_optional = context->Input(4); + const Tensor* fc1_experts_bias_optional = context->Input(3); + const Tensor* fc2_experts_weights = context->Input(4); const Tensor* fc2_experts_bias_optional = context->Input(5); + const Tensor* fc3_experts_weights_optional = context->Input(6); + const Tensor* fc3_experts_bias_optional = context->Input(7); + + MoEParameters moe_params(tensor_shards_); + MoEQuantType quant_type = MoEQuantType::None; + ORT_RETURN_IF_ERROR(CheckInputs(moe_params, quant_type, input, router_probs, fc1_experts_weights, + fc1_experts_bias_optional, fc2_experts_weights, fc2_experts_bias_optional, + fc3_experts_weights_optional, fc3_experts_bias_optional)); - MoEParameters moe_params; - ORT_RETURN_IF_ERROR(CheckInputs(moe_params, input, router_probs, fc1_experts_weights, fc2_experts_weights, - fc1_experts_bias_optional, fc2_experts_bias_optional)); - ORT_RETURN_IF_NOT(moe_params.num_experts % nccl_->Size() == 0, - "num_experts should be divisible by world_size"); + ORT_RETURN_IF_NOT(moe_params.num_experts % nccl_->Size() == 0, "num_experts should be divisible by world_size"); + + if (moe_params.parallel_type == MoEParallelType::EP || moe_params.parallel_type == MoEParallelType::EPAndTP) { + ORT_RETURN_IF_ERROR(SynchronizeExpertsStartIndex(allocator, context, copy_event)); + } - ort_fastertransformer::CutlassMoeFCRunner moe_runner(sm); + ort_fastertransformer::CutlassMoeFCRunner moe_runner(sm, fc3_experts_weights_optional != nullptr, + normalize_routing_weights_); - size_t ws_size = - moe_runner.getWorkspaceSize(static_cast(moe_params.num_rows), static_cast(moe_params.hidden_size), - static_cast(moe_params.inter_size), static_cast(moe_params.num_experts), - static_cast(k_)); + size_t ws_size = moe_runner.getWorkspaceSize( + static_cast(moe_params.num_rows), 
static_cast(moe_params.hidden_size), + static_cast(moe_params.inter_size), static_cast(moe_params.num_experts), static_cast(k_)); size_t fc2_output_size = k_ * moe_params.num_rows * moe_params.hidden_size * sizeof(CudaT); size_t expert_scales_size = k_ * moe_params.num_rows * sizeof(CudaT); @@ -95,54 +95,71 @@ Status ShardedMoE::ComputeInternal(OpKernelContext* context) const { IAllocatorUniquePtr expert_for_source_row = IAllocator::MakeUniquePtr(allocator, expert_for_source_row_size, false, stream); - // fc1_scales and fc2_scales are used in quantized MoE - const CudaT* fc1_scales_ptr = nullptr; - const CudaT* fc2_scales_ptr = nullptr; - - moe_runner.run_moe_fc(reinterpret_cast(input->template Data()), - reinterpret_cast(router_probs->template Data()), - reinterpret_cast(fc1_experts_weights->template Data()), - std::move(fc1_scales_ptr), - fc1_experts_bias_optional == nullptr - ? nullptr - : reinterpret_cast(fc1_experts_bias_optional->template Data()), - activation_type_, reinterpret_cast(fc2_experts_weights->template Data()), - std::move(fc2_scales_ptr), static_cast(moe_params.num_rows), - static_cast(moe_params.hidden_size), - static_cast(moe_params.inter_size), static_cast(moe_params.num_experts), - static_cast(moe_params.local_num_experts), static_cast(local_experts_start_index_), - static_cast(k_), reinterpret_cast(work_space.get()), - reinterpret_cast(fc2_output.get()), reinterpret_cast(expert_scales.get()), - reinterpret_cast(expanded_source_row_to_expanded_dest_row.get()), - reinterpret_cast(expert_for_source_row.get()), Stream(context)); + const CudaT* fc_scales_ptr = nullptr; + + moe_runner.run_moe_fc( + reinterpret_cast(input->template Data()), + reinterpret_cast(router_probs->template Data()), + reinterpret_cast(fc1_experts_weights->template Data()), std::move(fc_scales_ptr), + fc1_experts_bias_optional == nullptr + ? nullptr + : reinterpret_cast(fc1_experts_bias_optional->template Data()), + activation_type_, + fc3_experts_weights_optional == nullptr + ? nullptr + : reinterpret_cast(fc3_experts_weights_optional->template Data()), + std::move(fc_scales_ptr), + fc3_experts_bias_optional == nullptr + ? 
nullptr + : reinterpret_cast(fc3_experts_bias_optional->template Data()), + reinterpret_cast(fc2_experts_weights->template Data()), std::move(fc_scales_ptr), + static_cast(moe_params.num_rows), static_cast(moe_params.hidden_size), + static_cast(moe_params.inter_size), static_cast(moe_params.num_experts), + static_cast(moe_params.local_num_experts), static_cast(local_experts_start_index_), + static_cast(k_), reinterpret_cast(work_space.get()), reinterpret_cast(fc2_output.get()), + reinterpret_cast(expert_scales.get()), + reinterpret_cast(expanded_source_row_to_expanded_dest_row.get()), + reinterpret_cast(expert_for_source_row.get()), Stream(context)); Tensor* output = context->Output(0, input->Shape()); - size_t stride_count = moe_params.hidden_size; - size_t stride_bytes = stride_count * sizeof(CudaT); - int64_t total_past_rows = 0; - int64_t total_covered_rows = 0; - if (copy_event != nullptr) { - CUDA_RETURN_IF_ERROR(cudaEventSynchronize(copy_event)); + if (moe_params.parallel_type == MoEParallelType::None) { + fc2_output_bc = std::move(fc2_output); } - NCCL_RETURN_IF_ERROR(ncclGroupStart()); - for (int rank = 0; rank < nccl_->Size(); ++rank) { - int64_t experts_start_index = rank_to_experts_start_index_[rank]; - moe_runner.get_total_rows_info(experts_start_index, - moe_params.local_num_experts, - total_past_rows, - total_covered_rows); - const char* src = reinterpret_cast(fc2_output.get()) + total_past_rows * stride_bytes; - char* dst = reinterpret_cast(fc2_output_bc.get()) + total_past_rows * stride_bytes; - NCCL_RETURN_IF_ERROR(ncclBroadcast(src, - dst, - total_covered_rows * stride_count, - GetNcclDataType(input->DataType()), - rank, - nccl_->Comm(), - Stream(context))); + + if (moe_params.parallel_type == MoEParallelType::EPAndTP) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Expert and Tensor Parallelism is not supported yet"); + } + + if (moe_params.parallel_type == MoEParallelType::TP) { + ORT_ENFORCE(moe_params.tensor_shards == nccl_->Size()); + NCCL_RETURN_IF_ERROR(ncclGroupStart()); + NCCL_RETURN_IF_ERROR(ncclAllReduce(reinterpret_cast(fc2_output.get()), + reinterpret_cast(fc2_output_bc.get()), fc2_output_size / sizeof(CudaT), + GetNcclDataType(input->DataType()), ncclSum, nccl_->Comm(), Stream(context))); + NCCL_RETURN_IF_ERROR(ncclGroupEnd()); + } + + if (moe_params.parallel_type == MoEParallelType::EP) { + size_t stride_count = moe_params.hidden_size; + size_t stride_bytes = stride_count * sizeof(CudaT); + int64_t total_past_rows = 0; + int64_t total_covered_rows = 0; + if (copy_event != nullptr) { + CUDA_RETURN_IF_ERROR(cudaEventSynchronize(copy_event)); + } + NCCL_RETURN_IF_ERROR(ncclGroupStart()); + for (int rank = 0; rank < nccl_->Size(); ++rank) { + int64_t experts_start_index = rank_to_experts_start_index_[rank]; + moe_runner.get_total_rows_info(experts_start_index, moe_params.local_num_experts, total_past_rows, + total_covered_rows); + const char* src = reinterpret_cast(fc2_output.get()) + total_past_rows * stride_bytes; + char* dst = reinterpret_cast(fc2_output_bc.get()) + total_past_rows * stride_bytes; + NCCL_RETURN_IF_ERROR(ncclBroadcast(src, dst, total_covered_rows * stride_count, + GetNcclDataType(input->DataType()), rank, nccl_->Comm(), Stream(context))); + } + NCCL_RETURN_IF_ERROR(ncclGroupEnd()); } - NCCL_RETURN_IF_ERROR(ncclGroupEnd()); ort_fastertransformer::finalize_moe_routing_kernelLauncher( reinterpret_cast(fc2_output_bc.get()), reinterpret_cast(output->template MutableData()), @@ -158,8 +175,7 @@ Status 
ShardedMoE<T>::ComputeInternal(OpKernelContext* context) const { } template <typename T> -Status ShardedMoE<T>::SynchronizeExpertsStartIndex(AllocatorPtr& allocator, - OpKernelContext* context, +Status ShardedMoE<T>::SynchronizeExpertsStartIndex(AllocatorPtr& allocator, OpKernelContext* context, cudaEvent_t& cuda_event) const { if (rank_to_experts_start_index_[0] != std::numeric_limits<int64_t>::min()) { return Status::OK(); } @@ -176,23 +192,16 @@ Status ShardedMoE<T>::SynchronizeExpertsStartIndex(AllocatorPtr& allocator, IAllocator::MakeUniquePtr<int64_t>(allocator, nccl_->Size(), false, stream); // Only happens in the first run. - CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(experts_start_index_d.get(), - &local_experts_start_index_, - IndexTypeSize, - cudaMemcpyHostToDevice, - Stream(context))); + CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(experts_start_index_d.get(), &local_experts_start_index_, IndexTypeSize, + cudaMemcpyHostToDevice, Stream(context))); NCCL_RETURN_IF_ERROR(ncclAllGather(reinterpret_cast<const char*>(experts_start_index_d.get()), - reinterpret_cast<char*>(rank_to_experts_start_index_d.get()), - 1, - GetNcclDataType(DataTypeImpl::GetType<int64_t>()), - nccl_->Comm(), + reinterpret_cast<char*>(rank_to_experts_start_index_d.get()), 1, + GetNcclDataType(DataTypeImpl::GetType<int64_t>()), nccl_->Comm(), Stream(context))); // The const_cast<> violates the const modifier to make sure the synchronization happens only once per session. CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(const_cast<int64_t*>(rank_to_experts_start_index_.data()), - rank_to_experts_start_index_d.get(), - nccl_->Size() * IndexTypeSize, - cudaMemcpyDeviceToHost, - Stream(context))); + rank_to_experts_start_index_d.get(), nccl_->Size() * IndexTypeSize, + cudaMemcpyDeviceToHost, Stream(context))); CUDA_RETURN_IF_ERROR(cudaEventCreateWithFlags(&cuda_event, cudaEventDisableTiming)); CUDA_RETURN_IF_ERROR(cudaEventRecord(cuda_event, Stream(context))); @@ -204,5 +213,3 @@ Status ShardedMoE<T>::SynchronizeExpertsStartIndex(AllocatorPtr& allocator, } // namespace cuda } // namespace contrib } // namespace onnxruntime - -#endif diff --git a/onnxruntime/contrib_ops/cuda/collective/sharded_moe.h b/onnxruntime/contrib_ops/cuda/collective/sharded_moe.h index cbd483fddab7..827283a794dd 100644 --- a/onnxruntime/contrib_ops/cuda/collective/sharded_moe.h +++ b/onnxruntime/contrib_ops/cuda/collective/sharded_moe.h @@ -1,8 +1,6 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License.
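// ComputeInternal above now branches on moe_params.parallel_type: tensor
// parallelism (TP) all-reduces the partial FC2 results, expert parallelism
// (EP) broadcasts each rank's block of routed rows, and EP+TP is rejected.
// A condensed sketch of that collective phase; the enum, RowSpan helper and
// float buffers are simplified stand-ins, only the NCCL calls mirror the diff:
#include <cstddef>
#include <vector>
#include <cuda_runtime.h>
#include <nccl.h>

enum class ParallelType { None, TP, EP, EPAndTP };
struct RowSpan { size_t past_rows; size_t covered_rows; };  // per-rank rows

ncclResult_t CombineExpertOutputs(ParallelType pt, const float* fc2_out,
                                  float* fc2_out_bc, size_t total_elems,
                                  size_t hidden_size,
                                  const std::vector<RowSpan>& spans,
                                  ncclComm_t comm, cudaStream_t stream) {
  if (pt == ParallelType::TP) {
    // Every rank holds a partial sum over its weight shard; sum them all.
    return ncclAllReduce(fc2_out, fc2_out_bc, total_elems, ncclFloat, ncclSum,
                         comm, stream);
  }
  if (pt == ParallelType::EP) {
    // Each rank owns a contiguous block of rows (tracked via the expert start
    // indices gathered in SynchronizeExpertsStartIndex) and broadcasts it.
    ncclGroupStart();
    for (int rank = 0; rank < static_cast<int>(spans.size()); ++rank) {
      const float* src = fc2_out + spans[rank].past_rows * hidden_size;
      float* dst = fc2_out_bc + spans[rank].past_rows * hidden_size;
      ncclBroadcast(src, dst, spans[rank].covered_rows * hidden_size,
                    ncclFloat, rank, comm, stream);
    }
    return ncclGroupEnd();
  }
  return ncclSuccess;  // None: single rank; EP+TP: rejected earlier.
}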
-#ifdef USE_CUTLASS - #pragma once #include "contrib_ops/cuda/moe/ft_moe/moe_kernel.h" @@ -28,6 +26,7 @@ class ShardedMoE final : public NcclKernel, public MoEBase { Status SynchronizeExpertsStartIndex(AllocatorPtr& alloc, OpKernelContext* ctx, cudaEvent_t& cuda_event) const; int64_t local_experts_start_index_; + int64_t tensor_shards_; std::vector rank_to_experts_start_index_; }; @@ -36,5 +35,3 @@ class ShardedMoE final : public NcclKernel, public MoEBase { } // namespace cuda } // namespace contrib } // namespace onnxruntime - -#endif diff --git a/onnxruntime/contrib_ops/cuda/cuda_contrib_kernels.cc b/onnxruntime/contrib_ops/cuda/cuda_contrib_kernels.cc index be7e9f6a8225..583e67b2e6de 100644 --- a/onnxruntime/contrib_ops/cuda/cuda_contrib_kernels.cc +++ b/onnxruntime/contrib_ops/cuda/cuda_contrib_kernels.cc @@ -70,13 +70,13 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, float, Crop); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, double, Crop); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, MLFloat16, Crop); -#ifdef USE_CUTLASS class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, float, MoE); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, MLFloat16, MoE); -#endif +class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, QMoE); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, float, MultiHeadAttention); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, MLFloat16, MultiHeadAttention); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, MLFloat16, GroupQueryAttention); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, BFloat16, GroupQueryAttention); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, float, DecoderAttention); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, MLFloat16, DecoderAttention); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, int32_t, DynamicSlice); @@ -97,6 +97,8 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, MLFloat16, ParametricSoftplus); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, float, RotaryEmbedding); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, MLFloat16, RotaryEmbedding); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, BFloat16, RotaryEmbedding); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, MLFloat16, GemmaRotaryEmbedding); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, Sampling); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, float, ScaledTanh); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, double, ScaledTanh); @@ -120,6 +122,7 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, MLFloat16_float_MLFloat16, SimplifiedLayerNormalization); class 
ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, float_float_MLFloat16, SimplifiedLayerNormalization); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, MLFloat16_float_float, SimplifiedLayerNormalization); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, BFloat16_float_BFloat16, SimplifiedLayerNormalization); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, Inverse); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, MLFloat16, MatMulNBits); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, float, MatMulNBits); @@ -167,10 +170,8 @@ class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, AllR class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, AllGather); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, AllToAll); -#ifdef USE_CUTLASS class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, float, ShardedMoE); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, MLFloat16, ShardedMoE); -#endif class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, float, DistributedMatMul); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, MLFloat16, DistributedMatMul); @@ -204,6 +205,10 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, MLFloat16, DistributedSqueeze); #endif +#ifdef ENABLE_CUDA_NHWC_OPS +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 16, float, GridSample); +#endif + template <> KernelCreateInfo BuildKernelCreateInfo() { KernelCreateInfo info; @@ -270,13 +275,13 @@ Status RegisterCudaContribKernels(KernelRegistry& kernel_registry) { BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, -#ifdef USE_CUTLASS BuildKernelCreateInfo, BuildKernelCreateInfo, -#endif + BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, + BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, @@ -297,6 +302,8 @@ Status RegisterCudaContribKernels(KernelRegistry& kernel_registry) { BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, @@ -320,6 +327,7 @@ Status RegisterCudaContribKernels(KernelRegistry& kernel_registry) { BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, + BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, @@ -373,10 +381,8 @@ Status RegisterCudaContribKernels(KernelRegistry& kernel_registry) { BuildKernelCreateInfo, BuildKernelCreateInfo, -#ifdef USE_CUTLASS BuildKernelCreateInfo, BuildKernelCreateInfo, -#endif BuildKernelCreateInfo, BuildKernelCreateInfo, @@ -410,6 +416,9 @@ Status RegisterCudaContribKernels(KernelRegistry& kernel_registry) { BuildKernelCreateInfo, #endif +#ifdef ENABLE_CUDA_NHWC_OPS + BuildKernelCreateInfo, +#endif }; for (auto& function_table_entry : function_table) { diff --git a/onnxruntime/contrib_ops/cuda/diffusion/group_norm.cc b/onnxruntime/contrib_ops/cuda/diffusion/group_norm.cc index 87e88ac31c99..dea5391c7629 100644 --- 
a/onnxruntime/contrib_ops/cuda/diffusion/group_norm.cc +++ b/onnxruntime/contrib_ops/cuda/diffusion/group_norm.cc @@ -24,7 +24,8 @@ namespace { template <typename T> struct DispatchGroupNorm { - Status operator()(cudaStream_t stream, + Status operator()(CudaTuningContext* tuning_ctx, + Stream* ort_stream, Tensor* output, Tensor* add_out, const Tensor* input, @@ -44,7 +45,8 @@ struct DispatchGroupNorm { int channels_per_block) { typedef typename ToCudaType<T>::MappedType CudaT; return LaunchGroupNormKernel<CudaT>( - stream, + tuning_ctx, + ort_stream, reinterpret_cast<CudaT*>(output->MutableData<T>()), add_out == nullptr ? nullptr : reinterpret_cast<CudaT*>(add_out->MutableData<T>()), reinterpret_cast<const CudaT*>(input->Data<T>()), @@ -209,7 +211,8 @@ Status GroupNorm::ComputeInternal(OpKernelContext* context) const { context->GetComputeStream()); utils::MLTypeCallDispatcher<float, MLFloat16> dispatcher(input->GetElementType()); - return dispatcher.InvokeRet<Status, DispatchGroupNorm>(Stream(context), output, add_out, input, skip, bias, + return dispatcher.InvokeRet<Status, DispatchGroupNorm>(GetTuningContext(), + context->GetComputeStream(), output, add_out, input, skip, bias, gamma, beta, workspace.get(), epsilon_, batch_size, diff --git a/onnxruntime/contrib_ops/cuda/diffusion/group_norm_common_base.cc b/onnxruntime/contrib_ops/cuda/diffusion/group_norm_common_base.cc new file mode 100644 index 000000000000..5dec69052884 --- /dev/null +++ b/onnxruntime/contrib_ops/cuda/diffusion/group_norm_common_base.cc @@ -0,0 +1,101 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 1993-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// The CUDA kernel is modified from GroupNorm plugin of TensorRT 8.5 +// Modifications: heuristic channels per block; support epsilon; support skip and bias; update coding style. +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "contrib_ops/cuda/diffusion/group_norm_common_base.h" + +using namespace onnxruntime::cuda; + +namespace onnxruntime { +namespace contrib { +namespace cuda { + +int NextSize(int x) { + for (size_t i = 0; i < kNumOfSizes; ++i) { + if (x <= kSizes[i]) { + return kSizes[i]; + } + } + + return x; +} + +int32_t GetThreadsPerBlock(int32_t channels_per_block, int32_t channels_per_thread) { + return NextSize(channels_per_block) / channels_per_thread; +} + +int32_t FindMaxDivisor(int32_t n, int32_t max_allowed_divisor) { + int32_t max_divisor = -1; + for (int32_t i = 1; i <= std::sqrt(n); i++) { + if (n % i == 0) { + int32_t divisor1 = n / i; + int32_t divisor2 = i; + + if (divisor1 > max_divisor && divisor1 < max_allowed_divisor) { + max_divisor = divisor1; + } + if (divisor2 > max_divisor && divisor2 < max_allowed_divisor) { + max_divisor = divisor2; + } + } + } + return max_divisor; +} + +// Find proper channels per block based on a cost function: The cost is number of channels corresponding to +// extra threads allocated but no channels assigned to them to work on.
If cost is zero, every thread has +// work to do so it is ideal case. +int FindChannelsPerBlock(int num_channels, int channels_per_group) { + int min_cost = -1; + int best_candidate = -1; + for (size_t i = kNumOfSizes; i > 0; --i) { + if (kSizes[i - 1] < channels_per_group) { + break; + } + + int channels_per_block = kSizes[i - 1] / channels_per_group * channels_per_group; + int blocks = (num_channels + channels_per_block - 1) / channels_per_block; + int cost = blocks * kSizes[i - 1] - num_channels; + if (cost == 0) { + return channels_per_block; + } + + if (min_cost == -1 || cost < min_cost) { + min_cost = cost; + best_candidate = channels_per_block; + } + } + + return best_candidate; +} + +int GetChannelsPerBlock(int num_channels, int num_groups) { + int32_t channels_per_group = num_channels / num_groups; + int32_t channels_per_block = channels_per_group; + if (channels_per_group < kMaxSize / 2) { + channels_per_block = FindChannelsPerBlock(num_channels, channels_per_group); + } + return channels_per_block; +} + +} // namespace cuda +} // namespace contrib +} // namespace onnxruntime diff --git a/onnxruntime/contrib_ops/cuda/diffusion/group_norm_common_base.h b/onnxruntime/contrib_ops/cuda/diffusion/group_norm_common_base.h new file mode 100644 index 000000000000..a80584d3293a --- /dev/null +++ b/onnxruntime/contrib_ops/cuda/diffusion/group_norm_common_base.h @@ -0,0 +1,186 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 1993-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// The CUDA kernel is modified from GroupNorm plugin of TensorRT 8.5 +// Modifications: heuristic channels per block; support epsilon; support skip and bias; update coding style. +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. +#pragma once +#include "core/providers/cuda/cuda_common.h" +using namespace onnxruntime::cuda; + +namespace onnxruntime { +namespace contrib { +namespace cuda { + +// TODO: Similar to SkipLayerNorm kernel, read/write up to 8 channels at same time. +constexpr static int32_t CHANNELS_PER_THREAD = 2; + +constexpr static int kSizes[] = {128, 256, 320, 384, 512}; +constexpr static size_t kNumOfSizes = sizeof(kSizes) / sizeof(kSizes[0]); +constexpr static int kMaxSize = kSizes[kNumOfSizes - 1]; + +int32_t GetThreadsPerBlock(int32_t channels_per_block, int32_t channels_per_thread); + +static inline int32_t DivUp(int32_t m, int32_t n) { + return (m + n - 1) / n; +} + +int32_t FindMaxDivisor(int32_t n, int32_t max_allowed_divisor); + +int GetChannelsPerBlock(int num_channels, int num_groups); + +template +struct GroupNormNHWCParams { + // The output buffer. Shape is (n, h, w, c). + T* dst; + + // Optional output of element-wise add result of src, skip and bias. Shape is (n, h, w, c). + T* add_out; + + // The input buffer. Shape is (n, h, w, c). + T const* src; + + // Optional input buffer for skip tensor. 
Shape is (n, h, w, c) or (n, 1, 1, c) or (n, c). + T const* skip; + + // Optional input buffer for bias tensor. Shape is (c). + T const* bias; + + // The gamma scaling factor. + float const* gamma; + + // The beta term to add in GN. + float const* beta; + + // The temporary buffer to do the global parallel reduction. Shape is (n, 2, g), where g is number of groups. + float* group_sum_buffer; + + // The number of instances in the batch. + int32_t n; + + // The height and width of each activation map. + int32_t h; + int32_t w; + + // Number of channels. + int32_t c; + + // Number of groups. + int32_t groups; + + // Do we apply the SiLU activation function? + bool use_silu; + + // Precomputed values and parameters to control the execution of the kernels. + + // Number of activations per instance (h * w) + int32_t hw; + + // Number of activations per block + int32_t hw_per_block; + + // Number of channels per block in the C dimension. + int32_t channels_per_block; + + // Number of channels per group in the C dimension. + int32_t channels_per_group; + + // The precomputed stride between instances. + int32_t hwc; + // The inverse of hw*channels_per_group to compute mean of a group. + float inv_hw_channels_per_group; + // The precomputed number of groups per block. + int32_t groups_per_block; + + // Number of threads per block + int32_t threads_per_block; + + // Epsilon to get stable variance in normalization. + float epsilon; + + // Whether skip needs broadcast. True if shape of skip is (N, C) or (N, 1, 1, C); False otherwise. + bool broadcast_skip; + + // For SkipGroupNorm, it points to the intermediate result of adding skip and bias. + T* skip_workspace; + + GroupNormNHWCParams(T* output, + T* add_out, + const T* input, + const T* skip, + const T* bias, + const float* gamma, + const float* beta, + float* workspace, + float epsilon, + int batch_size, + int num_channels, + int height, + int width, + int num_groups, + bool use_silu, + bool broadcast_skip, + int channels_per_block) { + int32_t channels_per_group_in = num_channels / num_groups; + // channels_per_block is computed in PrePack. + // If gamma is not an initializer, channels_per_block might be zero after PrePack. If that happens, compute it here. + if (channels_per_block < channels_per_group_in) { + channels_per_block = GetChannelsPerBlock(num_channels, num_groups); + } + + this->use_silu = use_silu; + this->dst = output; + this->add_out = add_out; + this->src = input; + this->skip = skip; + this->bias = bias; + this->gamma = gamma; + this->beta = beta; + this->group_sum_buffer = workspace; + this->n = batch_size; + this->h = height; + this->w = width; + this->c = num_channels; + this->groups = num_groups; + this->hw = this->h * this->w; + + // This will allocate as many blocks as possible to partition HW. + // For Stable Diffusion, latent hw is 4K ~ 16K. This will allocate 1024 blocks, and each handles 4~16 hw. + // TODO: tune this logic to find proper blocks when hw is small.
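// Worked example for the block-size heuristic above (GetChannelsPerBlock /
// FindChannelsPerBlock in group_norm_common_base.cc): a standalone
// re-implementation of the same cost function, checked with concrete numbers.
// Names carry an Ex suffix to mark them as illustrative copies.
#include <cassert>
#include <cstddef>

static const int kSizesEx[] = {128, 256, 320, 384, 512};
static const size_t kNumOfSizesEx = sizeof(kSizesEx) / sizeof(kSizesEx[0]);

int FindChannelsPerBlockEx(int num_channels, int channels_per_group) {
  int min_cost = -1;
  int best_candidate = -1;
  for (size_t i = kNumOfSizesEx; i > 0; --i) {
    if (kSizesEx[i - 1] < channels_per_group) break;
    int channels_per_block = kSizesEx[i - 1] / channels_per_group * channels_per_group;
    int blocks = (num_channels + channels_per_block - 1) / channels_per_block;
    int cost = blocks * kSizesEx[i - 1] - num_channels;  // idle thread-channels
    if (cost == 0) return channels_per_block;
    if (min_cost == -1 || cost < min_cost) {
      min_cost = cost;
      best_candidate = channels_per_block;
    }
  }
  return best_candidate;
}

int main() {
  // 320 channels in 32 groups (10 per group): candidates 512 and 384 waste
  // 192 and 64 thread-channels, but 320 fits exactly (cost 0), so one block
  // covers all channels; with 2 channels per thread that is the
  // threads_per_block == 160 arm of the launch switches further below.
  assert(FindChannelsPerBlockEx(320, 10) == 320);
  // 960 channels in 32 groups (30 per group): no candidate is free; 510
  // (from the 512 bucket, 2 blocks, cost 64) is the cheapest and wins.
  assert(FindChannelsPerBlockEx(960, 30) == 510);
  return 0;
}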
+ constexpr int32_t max_blocks_per_hw = 1024; + const int32_t blocks_per_hw = FindMaxDivisor(this->hw, max_blocks_per_hw); + this->hw_per_block = DivUp(this->hw, blocks_per_hw); + + this->channels_per_block = channels_per_block; + this->channels_per_group = channels_per_group_in; + this->hwc = this->hw * this->c; + this->inv_hw_channels_per_group = 1.F / (float)(this->hw * this->channels_per_group); + this->groups_per_block = channels_per_block / this->channels_per_group; + this->epsilon = epsilon; + this->broadcast_skip = broadcast_skip; + + // Workspace for SkipGroupNorm to store intermediate results of src+skip+bias. + this->skip_workspace = (this->add_out != nullptr) ? this->add_out : this->dst; + + this->threads_per_block = GetThreadsPerBlock(channels_per_block, CHANNELS_PER_THREAD); + } +}; + +} // namespace cuda +} // namespace contrib +} // namespace onnxruntime diff --git a/onnxruntime/contrib_ops/cuda/diffusion/group_norm_impl.cu b/onnxruntime/contrib_ops/cuda/diffusion/group_norm_impl.cu index 48b161552ce0..4909dc5e3897 100644 --- a/onnxruntime/contrib_ops/cuda/diffusion/group_norm_impl.cu +++ b/onnxruntime/contrib_ops/cuda/diffusion/group_norm_impl.cu @@ -27,6 +27,8 @@ #include "core/providers/cuda/cu_inc/common.cuh" #include "contrib_ops/cuda/diffusion/group_norm_impl.h" #include "contrib_ops/cuda/transformers/dump_cuda_tensor.h" +#include "contrib_ops/cuda/diffusion/group_norm_common_base.h" +#include "contrib_ops/cuda/diffusion/group_norm_impl_kernel.cuh" using namespace onnxruntime::cuda; @@ -34,329 +36,6 @@ namespace onnxruntime { namespace contrib { namespace cuda { -namespace { - -// TODO: Similar to SkipLayerNorm kernel, read/write up to 8 channels at same time. -constexpr static int32_t CHANNELS_PER_THREAD = 2; - -constexpr static int kSizes[] = {128, 256, 320, 384, 512}; -constexpr static size_t kNumOfSizes = sizeof(kSizes) / sizeof(kSizes[0]); -constexpr static int kMaxSize = kSizes[kNumOfSizes - 1]; - -int NextSize(int x) { - for (size_t i = 0; i < kNumOfSizes; ++i) { - if (x <= kSizes[i]) { - return kSizes[i]; - } - } - - return x; -} -} // namespace - -static inline int32_t DivUp(int32_t m, int32_t n) { - return (m + n - 1) / n; -} - -static inline __device__ __host__ float sigmoid(float x) { - return 1.F / (1.F + expf(-x)); -} - -struct GroupSums { - // Is it the 1st element of the group? - int32_t flag; - // The sum. - float sum; - // The sum of squares. - float sum_sq; -}; - -struct GroupSumsOp { - inline __device__ GroupSums operator()(GroupSums const& a, GroupSums const& b) { - GroupSums dst; - dst.sum = b.flag ? b.sum : (a.sum + b.sum); - dst.sum_sq = b.flag ? b.sum_sq : (a.sum_sq + b.sum_sq); - dst.flag = a.flag + b.flag; - return dst; - } -}; - -template -struct GroupNormNHWCParams { - // The output buffer. Shape is (n, h, w, c). - T* dst; - - // Optional output of element-wise add result of src, skip and bias. Shape is (n, h, w, c). - T* add_out; - - // The input buffer. Shape is (n, h, w, c). - T const* src; - - // Optional input buffer for skip tensor. Shape is (n, h, w, c) or (n, 1, 1, c) or (n, c). - T const* skip; - - // Optional input buffer for bias tensor. Shape is (c). - T const* bias; - - // The gamma scaling factor. - float const* gamma; - - // The beta term to add in GN. - float const* beta; - - // The temporary buffer to do the global parallel reduction. Shape is (n, 2, g), where g is number of groups. - float* group_sum_buffer; - - // The number of instances in the batch. - int32_t n; - - // The height and width of each activation map. 
- int32_t h; - int32_t w; - - // Number of channels. - int32_t c; - - // Number of groups. - int32_t groups; - - // Do we apply the SiLU activation function? - bool use_silu; - - // Precomputed values and parameters to control the execution of the kernels. - - // Number of activations per instance (h * w) - int32_t hw; - - // Number of activations per block - int32_t hw_per_block; - - // Number of channels per block in the C dimension. - int32_t channels_per_block; - - // Number of channels per group in the C dimension. - int32_t channels_per_group; - - // The precomputed stride between instances. - int32_t hwc; - // The inverse of hw*channels_per_group to compute mean of a group. - float inv_hw_channels_per_group; - // The precomputed number of groups per block. - int32_t groups_per_block; - - // Number of threads per block - int32_t threads_per_block; - - // Epsilon to get stable variance in normalization. - float epsilon; - - // Whether skip need broadcast. True if shape of skip is (N, C) or (N, 1, 1, C); False otherwise. - bool broadcast_skip; - - // For SkipGroupNorm, it points to the intermediate result of adding skip and bias. - T* skip_workspace; -}; - -template -inline __device__ void UpdateSum(const T* src, int64_t offset, float& sum, float& sum_sq); - -template <> -inline __device__ void UpdateSum(const half* src, int64_t offset, float& sum, float& sum_sq) { - // Fetch two channels per thread. - __half2 h2 = *reinterpret_cast<__half2 const*>(&src[offset]); - - float2 f2 = __half22float2(h2); - - // Update the sum. - sum += f2.x + f2.y; - - // Update the sum of squares. - sum_sq += f2.x * f2.x + f2.y * f2.y; -} - -template <> -inline __device__ void UpdateSum(const float* src, int64_t offset, float& sum, float& sum_sq) { - // Fetch two channels per thread. - float2 f2 = *reinterpret_cast(&src[offset]); - - // Update the sum. - sum += f2.x + f2.y; - - // Update the sum of squares. - sum_sq += f2.x * f2.x + f2.y * f2.y; -} - -// Sum for SkipGroupNorm: add_out[offset] = src[offset] + skip[skip_offset] + bias[bias_offset] -template -inline __device__ void AddSkipBias(T* add_out, const T* src, const T* skip, const T* bias, - int64_t offset, int64_t skip_offset, int64_t bias_offset, float& sum, float& sum_sq); - -template <> -inline __device__ void AddSkipBias(half* add_out, const half* src, const half* skip, const half* bias, - int64_t offset, int64_t skip_offset, int64_t bias_offset, float& sum, float& sum_sq) { - // Fetch two channels per thread. 
- __half2 h2 = *reinterpret_cast<__half2 const*>(&src[offset]); - __half2 s = *reinterpret_cast<__half2 const*>(&skip[skip_offset]); - __half2 b = *reinterpret_cast<__half2 const*>(&bias[bias_offset]); - h2 = h2 + b; - h2 = h2 + s; - - *reinterpret_cast<__half2*>(&add_out[offset]) = h2; - - float2 f2 = __half22float2(h2); - sum += f2.x + f2.y; - sum_sq += f2.x * f2.x + f2.y * f2.y; -} - -template <> -inline __device__ void AddSkipBias(float* add_out, const float* src, const float* skip, const float* bias, - int64_t offset, int64_t skip_offset, int64_t bias_offset, float& sum, float& sum_sq) { - float2 f2 = *reinterpret_cast(&src[offset]); - float2 s = *reinterpret_cast(&skip[skip_offset]); - float2 b = *reinterpret_cast(&bias[bias_offset]); - f2.x += s.x + b.x; - f2.y += s.y + b.y; - - *reinterpret_cast(&add_out[offset]) = f2; - - sum += f2.x + f2.y; - sum_sq += f2.x * f2.x + f2.y * f2.y; -} - -// Sum for SkipGroupNorm without bias: add_out[offset] = src[offset] + skip[skip_offset] -template -inline __device__ void AddSkip(T* add_out, const T* src, const T* skip, - int64_t offset, int64_t skip_offset, float& sum, float& sum_sq); - -template <> -inline __device__ void AddSkip(half* add_out, const half* src, const half* skip, - int64_t offset, int64_t skip_offset, float& sum, float& sum_sq) { - __half2 h2 = *reinterpret_cast<__half2 const*>(&src[offset]); - __half2 s = *reinterpret_cast<__half2 const*>(&skip[skip_offset]); - h2 = h2 + s; - - *reinterpret_cast<__half2*>(&add_out[offset]) = h2; - - float2 f2 = __half22float2(h2); - sum += f2.x + f2.y; - sum_sq += f2.x * f2.x + f2.y * f2.y; -} - -template <> -inline __device__ void AddSkip(float* add_out, const float* src, const float* skip, - int64_t offset, int64_t skip_offset, float& sum, float& sum_sq) { - float2 f2 = *reinterpret_cast(&src[offset]); - float2 s = *reinterpret_cast(&skip[skip_offset]); - f2.x += s.x; - f2.y += s.y; - *reinterpret_cast(&add_out[offset]) = f2; - sum += f2.x + f2.y; - sum_sq += f2.x * f2.x + f2.y * f2.y; -} - -template -__global__ void GroupNormNHWCSumKernel(GroupNormNHWCParams params) { - // The object in charge of doing the sums for the different blocks. - typedef cub::BlockScan BlockScan; - - // Allocate shared memory for BlockScan. - __shared__ typename BlockScan::TempStorage temp_storage; - - // Allocate shared memory for the groups. We could reduce the amount of shared memory reserved. - __shared__ float2 smem[THREADS_PER_BLOCK]; - - // The instance in the batch. - int32_t ni = blockIdx.z; - - // The channel loaded by that thread. - int32_t ci = blockIdx.x * params.channels_per_block + threadIdx.x * CHANNELS_PER_THREAD; - - if (ci >= params.c || threadIdx.x * CHANNELS_PER_THREAD >= params.channels_per_block) { - return; - } - - // The first activation loaded by that block. - int32_t hw_begin = blockIdx.y * params.hw_per_block; - // The last activation loaded by that block. - int32_t hw_end = min(hw_begin + params.hw_per_block, params.hw); - - // The sums. - float sum = 0.F; - float sum_sq = 0.F; - - // Iterate over the activations to compute the sums. 
- int64_t offset = static_cast(ni) * params.hwc + static_cast(hw_begin) * params.c + ci; - if (params.skip != nullptr) { - // SkipGroupNorm: skip is (n, h, w, c) or (n, 1, 1, c) or (n, c), bias is (c), and add_out is (n, h, w, c) - const int64_t bias_offset = static_cast(ci); - T* add_out = params.skip_workspace; - if (params.broadcast_skip) { - const int64_t skip_offset = static_cast(ni) * params.c + ci; - - if (params.bias != nullptr) { - for (int32_t hwi = hw_begin; hwi < hw_end; ++hwi, offset += params.c) { - AddSkipBias(add_out, params.src, params.skip, params.bias, offset, skip_offset, bias_offset, sum, sum_sq); - } - } else { - for (int32_t hwi = hw_begin; hwi < hw_end; ++hwi, offset += params.c) { - AddSkip(add_out, params.src, params.skip, offset, skip_offset, sum, sum_sq); - } - } - } else { - if (params.bias != nullptr) { - for (int32_t hwi = hw_begin; hwi < hw_end; ++hwi, offset += params.c) { - AddSkipBias(add_out, params.src, params.skip, params.bias, offset, offset, bias_offset, sum, sum_sq); - } - } else { - for (int32_t hwi = hw_begin; hwi < hw_end; ++hwi, offset += params.c) { - AddSkip(add_out, params.src, params.skip, offset, offset, sum, sum_sq); - } - } - } - } else { // GroupNorm - for (int32_t hwi = hw_begin; hwi < hw_end; ++hwi, offset += params.c) { - UpdateSum(params.src, offset, sum, sum_sq); - } - } - - // The group index relative to the first group within the same block. - int32_t gi = threadIdx.x * CHANNELS_PER_THREAD / params.channels_per_group; - // The channel in the group. - int32_t cj = ci % params.channels_per_group; - - // The data for the summations. - GroupSums inp{cj == 0 ? 1 : 0, sum, sum_sq}; - - // Do the segmented scan. InclusiveScan is not deterministic. - GroupSums out; - BlockScan(temp_storage).InclusiveScan(inp, out, GroupSumsOp()); - - // Store the results for the groups in shared memory (to produce coalesced stores later). - // For each group, only the last thread of that group is picked to save sum to shared memory. - if (cj == params.channels_per_group - CHANNELS_PER_THREAD) { - smem[gi] = make_float2(out.sum, out.sum_sq); - } - - // Make sure the data is in shared memory. - __syncthreads(); - - // Threads that have nothing left to do, exit. - if (threadIdx.x >= params.groups_per_block) { - return; - } - - // The global group index. - // Use neighboring threads for coalesced write. - int32_t gj = blockIdx.x * params.groups_per_block + threadIdx.x; - - if (gj < params.groups) { - float2 sums = smem[threadIdx.x]; - const int index = (2 * ni) * params.groups + gj; - atomicAdd(¶ms.group_sum_buffer[index], sums.x); - atomicAdd(¶ms.group_sum_buffer[index + params.groups], sums.y); - } -} - template void GroupNormNHWCSum(GroupNormNHWCParams const& params, cudaStream_t stream) { dim3 grid; @@ -370,119 +49,26 @@ void GroupNormNHWCSum(GroupNormNHWCParams const& params, cudaStream_t stream) // The number of instances. grid.z = params.n; +#define LAUNCH_GROUPNORM_SUM(ThreadsPerBlock, VecSize) \ + GroupNormNHWCSumKernel \ + <<>>( \ + params.skip_workspace, params.group_sum_buffer, params.src, params.skip, params.bias, \ + params.channels_per_block, params.hw_per_block, params.hw, params.hwc, params.c, \ + params.channels_per_group, params.groups, params.groups_per_block, params.broadcast_skip); \ + break; + // Threads_per_block is half of values in kSizes since CHANNELS_PER_THREAD = 2. 
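// The LAUNCH_GROUPNORM_SUM macro above folds five near-identical kernel
// launches into one parameterized body whose trailing `break;` terminates
// each switch arm. A host-only toy with the same dispatch shape (the kernel
// launch is replaced by a print; names and template parameters are
// illustrative, not the ORT macro):
#include <cstdio>

template <int ThreadsPerBlock, int VecSize>
void ToyLaunch(int channels) {
  std::printf("launch<%d, %d>(channels=%d)\n", ThreadsPerBlock, VecSize, channels);
}

#define LAUNCH_TOY(ThreadsPerBlock, VecSize) \
  ToyLaunch<ThreadsPerBlock, VecSize>(channels); \
  break;

void Dispatch(int threads_per_block, int channels) {
  switch (threads_per_block) {
    case 256: LAUNCH_TOY(256, 2)
    case 192: LAUNCH_TOY(192, 2)
    case 160: LAUNCH_TOY(160, 2)
    case 128: LAUNCH_TOY(128, 2)
    case 64:  LAUNCH_TOY(64, 2)
  }
}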
switch (params.threads_per_block) { case 256: - GroupNormNHWCSumKernel<<>>(params); - break; + LAUNCH_GROUPNORM_SUM(256, CHANNELS_PER_THREAD) case 192: - GroupNormNHWCSumKernel<<>>(params); - break; + LAUNCH_GROUPNORM_SUM(192, CHANNELS_PER_THREAD) case 160: - GroupNormNHWCSumKernel<<>>(params); - break; + LAUNCH_GROUPNORM_SUM(160, CHANNELS_PER_THREAD) case 128: - GroupNormNHWCSumKernel<<>>(params); - break; + LAUNCH_GROUPNORM_SUM(128, CHANNELS_PER_THREAD) case 64: - GroupNormNHWCSumKernel<<>>(params); - break; - } -} - -template -__device__ void ComputeGroupNorm(const T* src, T* dst, int64_t offset, float mean, float inv_std_dev, - float2& gamma_f2, float2& beta_f2, bool silu); - -template <> -__device__ void ComputeGroupNorm(const half* src, half* dst, int64_t offset, float mean, float inv_std_dev, - float2& gamma_f2, float2& beta_f2, bool silu) { - // Fetch two channels per thread. - __half2 h2 = *reinterpret_cast<__half2 const*>(&src[offset]); - - // Extract the two half values. - float2 f2 = __half22float2(h2); - - // Normalize the channels. - f2.x = (f2.x - mean) * inv_std_dev; - f2.y = (f2.y - mean) * inv_std_dev; - - // Scale by gamma and add beta. - f2.x = gamma_f2.x * f2.x + beta_f2.x; - f2.y = gamma_f2.y * f2.y + beta_f2.y; - - // Apply SiLU activation if needed. - if (silu) { - f2.x = f2.x * sigmoid(f2.x); - f2.y = f2.y * sigmoid(f2.y); - } - - *reinterpret_cast<__half2*>(&dst[offset]) = __float22half2_rn(f2); -} - -template <> -__device__ void ComputeGroupNorm(const float* src, float* dst, int64_t offset, float mean, float inv_std_dev, - float2& gamma_f2, float2& beta_f2, bool silu) { - // Fetch two channels per thread. - float2 f2 = *reinterpret_cast(&src[offset]); - - // Normalize the channels. - f2.x = (f2.x - mean) * inv_std_dev; - f2.y = (f2.y - mean) * inv_std_dev; - - // Scale by gamma and add beta. - f2.x = gamma_f2.x * f2.x + beta_f2.x; - f2.y = gamma_f2.y * f2.y + beta_f2.y; - - // Apply SiLU activation if needed. - if (silu) { - f2.x = f2.x * sigmoid(f2.x); - f2.y = f2.y * sigmoid(f2.y); - } - - *reinterpret_cast(&dst[offset]) = f2; -} - -template -__global__ void GroupNormNHWCScaleKernel(GroupNormNHWCParams params) { - // The channel loaded by that thread. - int32_t ci = blockIdx.x * params.channels_per_block + threadIdx.x * CHANNELS_PER_THREAD; - if (ci >= params.c || threadIdx.x * CHANNELS_PER_THREAD >= params.channels_per_block) { - return; - } - - // The instance in the batch. - int32_t ni = blockIdx.z; - - // The group that thread works on. - int32_t gi = ci / params.channels_per_group; - - // Load the sum and sum of squares for the group. - float sum = 0.F, sum_sq = 0.F; - if (gi < params.groups) { - const int index = (2 * ni) * params.groups + gi; - sum = params.group_sum_buffer[index]; - sum_sq = params.group_sum_buffer[index + params.groups]; - } - - // Load gamma/beta. Fetch two per thread. - float2 gamma_f2 = *reinterpret_cast(¶ms.gamma[ci]); - float2 beta_f2 = *reinterpret_cast(¶ms.beta[ci]); - - // Compute the mean. - float mean = sum * params.inv_hw_channels_per_group; - // Compute the variance. - float var = sum_sq * params.inv_hw_channels_per_group - (mean * mean); - // Compute the inverse of the stddev. - float inv_std_dev = rsqrtf(var + params.epsilon); - - int32_t hw_begin = blockIdx.y * params.hw_per_block; - int32_t hw_end = min(hw_begin + params.hw_per_block, params.hw); - - const T* input = (params.skip != nullptr) ? 
params.skip_workspace : params.src; - int64_t offset = static_cast(ni) * params.hwc + static_cast(hw_begin) * params.c + ci; - for (int32_t hwi = hw_begin; hwi < hw_end; ++hwi, offset += params.c) { - ComputeGroupNorm(input, params.dst, offset, mean, inv_std_dev, gamma_f2, beta_f2, params.use_silu); + LAUNCH_GROUPNORM_SUM(64, CHANNELS_PER_THREAD) } } @@ -497,83 +83,34 @@ void GroupNormNHWCScale(GroupNormNHWCParams const& params, cudaStream_t strea // The number of instances. grid.z = params.n; +#define LAUNCH_GROUPNORM_SCALE(ThreadsPerBlock, VecSize) \ + GroupNormNHWCScaleKernel \ + <<>>( \ + params.dst, params.src, params.skip, params.gamma, params.beta, params.skip_workspace, \ + params.group_sum_buffer, params.epsilon, params.c, params.channels_per_block, params.channels_per_group, \ + params.groups, params.hwc, params.inv_hw_channels_per_group, params.hw, params.hw_per_block, \ + params.use_silu); \ + break; + // Threads_per_block is half of values in kSizes since CHANNELS_PER_THREAD = 2. switch (params.threads_per_block) { case 256: - GroupNormNHWCScaleKernel<<>>(params); - break; + LAUNCH_GROUPNORM_SCALE(256, CHANNELS_PER_THREAD) case 192: - GroupNormNHWCScaleKernel<<>>(params); - break; + LAUNCH_GROUPNORM_SCALE(192, CHANNELS_PER_THREAD) case 160: - GroupNormNHWCScaleKernel<<>>(params); - break; + LAUNCH_GROUPNORM_SCALE(160, CHANNELS_PER_THREAD) case 128: - GroupNormNHWCScaleKernel<<>>(params); - break; + LAUNCH_GROUPNORM_SCALE(128, CHANNELS_PER_THREAD) case 64: - GroupNormNHWCScaleKernel<<>>(params); - break; + LAUNCH_GROUPNORM_SCALE(64, CHANNELS_PER_THREAD) } } -int32_t FindMaxDivisor(int32_t n, int32_t max_allowed_divisor) { - int32_t max_divisor = -1; - for (int32_t i = 1; i <= std::sqrt(n); i++) { - if (n % i == 0) { - int32_t divisor1 = n / i; - int32_t divisor2 = i; - - if (divisor1 > max_divisor && divisor1 < max_allowed_divisor) { - max_divisor = divisor1; - } - if (divisor2 > max_divisor && divisor2 < max_allowed_divisor) { - max_divisor = divisor2; - } - } - } - return max_divisor; -} - -// Find proper channels per block based on a cost function: The cost is number of channels corresponding to -// extra threads allocated but no channels assigned to them to work on. If cost is zero, every thread has -// work to do so it is ideal case. 
-int FindChannelsPerBlock(int num_channels, int channels_per_group) { - int min_cost = -1; - int best_candidate = -1; - for (size_t i = kNumOfSizes; i > 0; --i) { - if (kSizes[i - 1] < channels_per_group) { - break; - } - - int channels_per_block = kSizes[i - 1] / channels_per_group * channels_per_group; - int blocks = (num_channels + channels_per_block - 1) / channels_per_block; - int cost = blocks * kSizes[i - 1] - num_channels; - if (cost == 0) { - return channels_per_block; - } - - if (min_cost == -1 || cost < min_cost) { - min_cost = cost; - best_candidate = channels_per_block; - } - } - - return best_candidate; -} - -int GetChannelsPerBlock(int num_channels, int num_groups) { - int32_t channels_per_group = num_channels / num_groups; - int32_t channels_per_block = channels_per_group; - if (channels_per_group < kMaxSize / 2) { - channels_per_block = FindChannelsPerBlock(num_channels, channels_per_group); - } - return channels_per_block; -} - template Status LaunchGroupNormKernel( - cudaStream_t stream, + CudaTuningContext* tuning_ctx, + Stream* ort_stream, T* output, T* add_out, const T* input, @@ -591,19 +128,17 @@ Status LaunchGroupNormKernel( bool use_silu, bool broadcast_skip, int channels_per_block) { - GroupNormNHWCParams params; - int32_t channels_per_group = num_channels / num_groups; - // channels_per_block is computed in PrePack. - // If the gamma is not initializer, channels_per_block might be zero after PrePack. In that happens, compute it here. - if (channels_per_block < channels_per_group) { - channels_per_block = GetChannelsPerBlock(num_channels, num_groups); - } + // tuning_ctx only used for ROCm EP. + ORT_UNUSED_PARAMETER(tuning_ctx); - // TODO: Update the kernel to support CHANNELS_PER_THREAD==1 and other corner cases - if (channels_per_block % channels_per_group != 0 || - channels_per_block > kMaxSize || - (channels_per_group % CHANNELS_PER_THREAD != 0)) { + GroupNormNHWCParams params(output, add_out, input, skip, bias, gamma, beta, reinterpret_cast(workspace), epsilon, + batch_size, num_channels, height, width, num_groups, use_silu, + broadcast_skip, channels_per_block); + + if (params.channels_per_block % params.channels_per_group != 0 || + params.channels_per_block > kMaxSize || + (params.channels_per_group % CHANNELS_PER_THREAD != 0)) { return ORT_MAKE_STATUS(ONNXRUNTIME, NOT_IMPLEMENTED, "GroupNorm in CUDA does not support the input: n=", batch_size, " h=", height, @@ -612,42 +147,7 @@ Status LaunchGroupNormKernel( " groups=", num_groups); } - params.use_silu = use_silu; - params.dst = output; - params.add_out = add_out; - params.src = input; - params.skip = skip; - params.bias = bias; - params.gamma = gamma; - params.beta = beta; - params.group_sum_buffer = reinterpret_cast(workspace); - params.n = batch_size; - params.h = height; - params.w = width; - params.c = num_channels; - params.groups = num_groups; - params.hw = params.h * params.w; - - // This will allocate as many blocks as possible to partition HW. - // For Stable Diffusion, latent hw is 4K ~ 16K. This will allocate 1024 blocks, and each handles 4~16 hw. - // TODO: tune this logic to find proper blocks when hw is small. 
- constexpr int32_t max_blocks_per_hw = 1024; - const int32_t blocks_per_hw = FindMaxDivisor(params.hw, max_blocks_per_hw); - params.hw_per_block = DivUp(params.hw, blocks_per_hw); - - params.channels_per_block = channels_per_block; - params.channels_per_group = channels_per_group; - params.hwc = params.hw * params.c; - params.inv_hw_channels_per_group = 1.F / (float)(params.hw * params.channels_per_group); - params.groups_per_block = channels_per_block / params.channels_per_group; - params.epsilon = epsilon; - params.broadcast_skip = broadcast_skip; - - // Workspace for SkipGroupNorm to store intermediate results of src+skip+bias. - params.skip_workspace = (params.add_out != nullptr) ? params.add_out : params.dst; - - params.threads_per_block = NextSize(channels_per_block) / CHANNELS_PER_THREAD; - + auto stream = static_cast(ort_stream->GetHandle()); CUDA_RETURN_IF_ERROR(cudaMemsetAsync( params.group_sum_buffer, 0, GetGroupNormWorkspaceSizeInBytes(batch_size, num_groups), stream)); @@ -663,14 +163,14 @@ Status LaunchGroupNormKernel( return Status::OK(); } -template Status LaunchGroupNormKernel(cudaStream_t stream, half* output, half* add_out, +template Status LaunchGroupNormKernel(CudaTuningContext* tuning_ctx, Stream* stream, half* output, half* add_out, const half* input, const half* skip, const half* bias, const float* gamma, const float* beta, void* workspace, float epsilon, int batch_size, int num_channels, int height, int width, int num_groups, bool silu, bool broadcast_skip, int channels_per_block); -template Status LaunchGroupNormKernel(cudaStream_t stream, float* output, float* add_out, +template Status LaunchGroupNormKernel(CudaTuningContext* tuning_ctx, Stream* stream, float* output, float* add_out, const float* input, const float* skip, const float* bias, const float* gamma, const float* beta, void* workspace, float epsilon, int batch_size, int num_channels, diff --git a/onnxruntime/contrib_ops/cuda/diffusion/group_norm_impl.h b/onnxruntime/contrib_ops/cuda/diffusion/group_norm_impl.h index 9532aeecb2f5..98f38a1475ee 100644 --- a/onnxruntime/contrib_ops/cuda/diffusion/group_norm_impl.h +++ b/onnxruntime/contrib_ops/cuda/diffusion/group_norm_impl.h @@ -8,6 +8,8 @@ #include #include +#include "core/providers/cuda/tunable/cuda_tunable.h" + namespace onnxruntime { namespace contrib { namespace cuda { @@ -21,7 +23,8 @@ int GetChannelsPerBlock(int num_channels, int num_groups); template Status LaunchGroupNormKernel( - cudaStream_t stream, + CudaTuningContext* tuning_ctx, + Stream* ort_stream, T* output, // normalized output tensor. Shape is (n, h, w, c) T* add_out, // optional output tensor for element-wise sum of input + skip + bias. Shape is (n, h, w, c) const T* input, // input tensor. Shape is (n, h, w, c) diff --git a/onnxruntime/contrib_ops/cuda/diffusion/group_norm_impl_kernel.cuh b/onnxruntime/contrib_ops/cuda/diffusion/group_norm_impl_kernel.cuh new file mode 100644 index 000000000000..ecd06315e370 --- /dev/null +++ b/onnxruntime/contrib_ops/cuda/diffusion/group_norm_impl_kernel.cuh @@ -0,0 +1,451 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 1993-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// The CUDA kernel is modified from GroupNorm plugin of TensorRT 8.5 +// Modifications: heuristic channels per block; support epsilon; support skip and bias; update coding style. +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. +#pragma once +#include +#include +#include "core/providers/cuda/cuda_common.h" +#include "core/providers/cuda/cu_inc/common.cuh" + +using namespace onnxruntime::cuda; + +namespace onnxruntime { +namespace contrib { +namespace cuda { + +static inline __device__ __host__ float sigmoid(float x) { + return 1.F / (1.F + expf(-x)); +} + +struct GroupSums { + // Is it the 1st element of the group? + int32_t flag; + // The sum. + float sum; + // The sum of squares. + float sum_sq; +}; + +struct GroupSumsOp { + inline __device__ GroupSums operator()(GroupSums const& a, GroupSums const& b) { + GroupSums dst; + dst.sum = b.flag ? b.sum : (a.sum + b.sum); + dst.sum_sq = b.flag ? b.sum_sq : (a.sum_sq + b.sum_sq); + dst.flag = a.flag + b.flag; + return dst; + } +}; + +template +inline __device__ void UpdateSum(const T* src, int64_t offset, float& sum, float& sum_sq) { + using VecT = onnxruntime::cuda::aligned_vector; + const VecT input_v = *reinterpret_cast(src + offset); + +#pragma unroll + for (int i = 0; i < ILP; i++) { + const float val = static_cast(input_v.val[i]); + sum += val; + sum_sq += val * val; + } +} + +template <> +inline __device__ void UpdateSum(const half* src, int64_t offset, float& sum, float& sum_sq) { + // Fetch two channels per thread. + __half2 h2 = *reinterpret_cast<__half2 const*>(&src[offset]); + + float2 f2 = __half22float2(h2); + + // Update the sum. + sum += f2.x + f2.y; + + // Update the sum of squares. + sum_sq += f2.x * f2.x + f2.y * f2.y; +} + +template <> +inline __device__ void UpdateSum(const float* src, int64_t offset, float& sum, float& sum_sq) { + // Fetch two channels per thread. + float2 f2 = *reinterpret_cast(&src[offset]); + + // Update the sum. + sum += f2.x + f2.y; + + // Update the sum of squares. 
+ sum_sq += f2.x * f2.x + f2.y * f2.y; +} + +// Sum for SkipGroupNorm: add_out[offset] = src[offset] + skip[skip_offset] + bias[bias_offset] +template +inline __device__ void AddSkipBias(T* add_out, const T* src, const T* skip, const T* bias, + int64_t offset, int64_t skip_offset, int64_t bias_offset, float& sum, float& sum_sq) { + using VecT = onnxruntime::cuda::aligned_vector; + const VecT input_v = *reinterpret_cast(src + offset); + const VecT skip_v = *reinterpret_cast(skip + skip_offset); + const VecT bias_v = *reinterpret_cast(bias + bias_offset); + VecT output_v = *reinterpret_cast(add_out + offset); + +#pragma unroll + for (int i = 0; i < ILP; i++) { + output_v.val[i] = input_v.val[i] + skip_v.val[i] + bias_v.val[i]; + const float val = static_cast(output_v.val[i]); + sum += val; + sum_sq += val * val; + } + *(reinterpret_cast(add_out + offset)) = output_v; +} + +template <> +inline __device__ void AddSkipBias(half* add_out, const half* src, const half* skip, const half* bias, + int64_t offset, int64_t skip_offset, int64_t bias_offset, float& sum, float& sum_sq) { + // Fetch two channels per thread. + __half2 h2 = *reinterpret_cast<__half2 const*>(&src[offset]); + __half2 s = *reinterpret_cast<__half2 const*>(&skip[skip_offset]); + __half2 b = *reinterpret_cast<__half2 const*>(&bias[bias_offset]); + h2 = h2 + b; + h2 = h2 + s; + + *reinterpret_cast<__half2*>(&add_out[offset]) = h2; + + float2 f2 = __half22float2(h2); + sum += f2.x + f2.y; + sum_sq += f2.x * f2.x + f2.y * f2.y; +} + +template <> +inline __device__ void AddSkipBias(float* add_out, const float* src, const float* skip, const float* bias, + int64_t offset, int64_t skip_offset, int64_t bias_offset, float& sum, float& sum_sq) { + float2 f2 = *reinterpret_cast(&src[offset]); + float2 s = *reinterpret_cast(&skip[skip_offset]); + float2 b = *reinterpret_cast(&bias[bias_offset]); + f2.x += s.x + b.x; + f2.y += s.y + b.y; + + *reinterpret_cast(&add_out[offset]) = f2; + + sum += f2.x + f2.y; + sum_sq += f2.x * f2.x + f2.y * f2.y; +} + +// Sum for SkipGroupNorm without bias: add_out[offset] = src[offset] + skip[skip_offset] +template +inline __device__ void AddSkip(T* add_out, const T* src, const T* skip, + int64_t offset, int64_t skip_offset, float& sum, float& sum_sq) { + using VecT = onnxruntime::cuda::aligned_vector; + const VecT input_v = *reinterpret_cast(src + offset); + const VecT skip_v = *reinterpret_cast(skip + skip_offset); + VecT output_v = *reinterpret_cast(add_out + offset); + +#pragma unroll + for (int i = 0; i < ILP; i++) { + output_v.val[i] = input_v.val[i] + skip_v.val[i]; + const float val = static_cast(output_v.val[i]); + sum += val; + sum_sq += val * val; + } + *(reinterpret_cast(add_out + offset)) = output_v; +} + +template <> +inline __device__ void AddSkip(half* add_out, const half* src, const half* skip, + int64_t offset, int64_t skip_offset, float& sum, float& sum_sq) { + __half2 h2 = *reinterpret_cast<__half2 const*>(&src[offset]); + __half2 s = *reinterpret_cast<__half2 const*>(&skip[skip_offset]); + h2 = h2 + s; + + *reinterpret_cast<__half2*>(&add_out[offset]) = h2; + + float2 f2 = __half22float2(h2); + sum += f2.x + f2.y; + sum_sq += f2.x * f2.x + f2.y * f2.y; +} + +template <> +inline __device__ void AddSkip(float* add_out, const float* src, const float* skip, + int64_t offset, int64_t skip_offset, float& sum, float& sum_sq) { + float2 f2 = *reinterpret_cast(&src[offset]); + float2 s = *reinterpret_cast(&skip[skip_offset]); + f2.x += s.x; + f2.y += s.y; + *reinterpret_cast(&add_out[offset]) = f2; 
+ sum += f2.x + f2.y; + sum_sq += f2.x * f2.x + f2.y * f2.y; +} + +template +__global__ void GroupNormNHWCSumKernel(T* skip_workspace, float* group_sum_buffer, const T* src, const T* skip, const T* bias, + int32_t channels_per_block, int32_t hw_per_block, int32_t hw, int32_t hwc, int32_t c, + int32_t channels_per_group, int32_t groups, int32_t groups_per_block, bool broadcast_skip) { + // The object in charge of doing the sums for the different blocks. + typedef cub::BlockScan BlockScan; + + // Allocate shared memory for BlockScan. + __shared__ typename BlockScan::TempStorage temp_storage; + + // Allocate shared memory for the groups. We could reduce the amount of shared memory reserved. + __shared__ float2 smem[THREADS_PER_BLOCK]; + + // The instance in the batch. + int32_t ni = blockIdx.z; + + // The channel loaded by that thread. + int32_t ci = blockIdx.x * channels_per_block + threadIdx.x * ILP; + + if (ci >= c || threadIdx.x * ILP >= channels_per_block) { + return; + } + + // The first activation loaded by that block. + int32_t hw_begin = blockIdx.y * hw_per_block; + // The last activation loaded by that block. + int32_t hw_end = min(hw_begin + hw_per_block, hw); + + // The sums. + float sum = 0.F; + float sum_sq = 0.F; + + // Iterate over the activations to compute the sums. + int64_t offset = static_cast(ni) * hwc + static_cast(hw_begin) * c + ci; + if (skip != nullptr) { + // SkipGroupNorm: skip is (n, h, w, c) or (n, 1, 1, c) or (n, c), bias is (c), and add_out is (n, h, w, c) + const int64_t bias_offset = static_cast(ci); + T* add_out = skip_workspace; + if (broadcast_skip) { + const int64_t skip_offset = static_cast(ni) * c + ci; + + if (bias != nullptr) { + for (int32_t hwi = hw_begin; hwi < hw_end; ++hwi, offset += c) { + AddSkipBias(add_out, src, skip, bias, offset, skip_offset, bias_offset, sum, sum_sq); + } + } else { + for (int32_t hwi = hw_begin; hwi < hw_end; ++hwi, offset += c) { + AddSkip(add_out, src, skip, offset, skip_offset, sum, sum_sq); + } + } + } else { + if (bias != nullptr) { + for (int32_t hwi = hw_begin; hwi < hw_end; ++hwi, offset += c) { + AddSkipBias(add_out, src, skip, bias, offset, offset, bias_offset, sum, sum_sq); + } + } else { + for (int32_t hwi = hw_begin; hwi < hw_end; ++hwi, offset += c) { + AddSkip(add_out, src, skip, offset, offset, sum, sum_sq); + } + } + } + } else { // GroupNorm + for (int32_t hwi = hw_begin; hwi < hw_end; ++hwi, offset += c) { + UpdateSum(src, offset, sum, sum_sq); + } + } + + // The group index relative to the first group within the same block. + int32_t gi = threadIdx.x * ILP / channels_per_group; + // The channel in the group. + int32_t cj = ci % channels_per_group; + + // The data for the summations. + GroupSums inp{cj == 0 ? 1 : 0, sum, sum_sq}; + + // Do the segmented scan. InclusiveScan is not deterministic. + GroupSums out; + BlockScan(temp_storage).InclusiveScan(inp, out, GroupSumsOp()); + + // Store the results for the groups in shared memory (to produce coalesced stores later). + // For each group, only the last thread of that group is picked to save sum to shared memory. + if (cj == channels_per_group - ILP) { + smem[gi] = make_float2(out.sum, out.sum_sq); + } + + // Make sure the data is in shared memory. + __syncthreads(); + + // Threads that have nothing left to do, exit. + if (threadIdx.x >= groups_per_block) { + return; + } + + // The global group index. + // Use neighboring threads for coalesced write. 
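// The ILP-templated helpers above (UpdateSum, AddSkipBias, AddSkip) share one
// pattern: a single aligned vector load covering ILP channels, an unrolled
// accumulation in float, and, for the skip variants, a single vector store.
// A self-contained CUDA sketch of that pattern with a hand-rolled vector type
// standing in for onnxruntime::cuda::aligned_vector:
#include <cuda_fp16.h>

template <typename T, int ILP>
struct alignas(sizeof(T) * ILP) AlignedVecEx {
  T val[ILP];
};

template <typename T, int ILP>
__device__ void UpdateSumSketch(const T* src, long long offset,
                                float& sum, float& sum_sq) {
  using VecT = AlignedVecEx<T, ILP>;
  const VecT v = *reinterpret_cast<const VecT*>(src + offset);  // one wide load
#pragma unroll
  for (int i = 0; i < ILP; ++i) {
    const float x = static_cast<float>(v.val[i]);
    sum += x;         // accumulates the E[x] numerator
    sum_sq += x * x;  // accumulates the E[x^2] numerator
  }
}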
+ int32_t gj = blockIdx.x * groups_per_block + threadIdx.x; + + if (gj < groups) { + float2 sums = smem[threadIdx.x]; + const int index = (2 * ni) * groups + gj; + atomicAdd(&group_sum_buffer[index], sums.x); + atomicAdd(&group_sum_buffer[index + groups], sums.y); + } +} + +template +__device__ void computeGroupNormVec(const T* src, T* dst, int64_t offset, float mean, float inv_std_dev, + const float* gamma_v, const float* beta_v, bool silu) { + using VecT = onnxruntime::cuda::aligned_vector; + const VecT input_v = *reinterpret_cast(src + offset); + VecT output_v; + +#pragma unroll + for (int i = 0; i < ILP; i++) { + float val = static_cast(input_v.val[i]); + val = (val - mean) * inv_std_dev; + val = gamma_v[i] * val + beta_v[i]; + + if (silu) { + val = val * sigmoid(val); + } + output_v.val[i] = static_cast(val); + } + *(reinterpret_cast(dst + offset)) = output_v; +} + +template +__device__ void ComputeGroupNorm(const T* src, T* dst, int64_t offset, float mean, float inv_std_dev, + float2& gamma_f2, float2& beta_f2, bool silu); + +template <> +__device__ void ComputeGroupNorm(const half* src, half* dst, int64_t offset, float mean, float inv_std_dev, + float2& gamma_f2, float2& beta_f2, bool silu) { + // Fetch two channels per thread. + __half2 h2 = *reinterpret_cast<__half2 const*>(&src[offset]); + + // Extract the two half values. + float2 f2 = __half22float2(h2); + + // Normalize the channels. + f2.x = (f2.x - mean) * inv_std_dev; + f2.y = (f2.y - mean) * inv_std_dev; + + // Scale by gamma and add beta. + f2.x = gamma_f2.x * f2.x + beta_f2.x; + f2.y = gamma_f2.y * f2.y + beta_f2.y; + + // Apply SiLU activation if needed. + if (silu) { + f2.x = f2.x * sigmoid(f2.x); + f2.y = f2.y * sigmoid(f2.y); + } + + *reinterpret_cast<__half2*>(&dst[offset]) = __float22half2_rn(f2); +} + +template <> +__device__ void ComputeGroupNorm(const float* src, float* dst, int64_t offset, float mean, float inv_std_dev, + float2& gamma_f2, float2& beta_f2, bool silu) { + // Fetch two channels per thread. + float2 f2 = *reinterpret_cast(&src[offset]); + + // Normalize the channels. + f2.x = (f2.x - mean) * inv_std_dev; + f2.y = (f2.y - mean) * inv_std_dev; + + // Scale by gamma and add beta. + f2.x = gamma_f2.x * f2.x + beta_f2.x; + f2.y = gamma_f2.y * f2.y + beta_f2.y; + + // Apply SiLU activation if needed. + if (silu) { + f2.x = f2.x * sigmoid(f2.x); + f2.y = f2.y * sigmoid(f2.y); + } + + *reinterpret_cast(&dst[offset]) = f2; +} + +template +__device__ void ComputeGroupNormKernel(const T* input, T* dst, int64_t offset, float mean, float inv_std_dev, + const float* gamma, const float* beta, bool use_silu, int32_t c, int32_t ci, int32_t hw_begin, int32_t hw_end) { + using VecF = onnxruntime::cuda::aligned_vector; + + const VecF gamma_v = *reinterpret_cast(gamma + ci); + const VecF beta_v = *reinterpret_cast(beta + ci); + // Iterate over the activations to compute the sums. + for (int32_t hwi = hw_begin; hwi < hw_end; ++hwi, offset += c) { + // Fetch ILP channels per thread. + computeGroupNormVec(input, dst, offset, mean, inv_std_dev, gamma_v.val, beta_v.val, use_silu); + } +} + +template <> +__device__ void ComputeGroupNormKernel(const float* input, float* dst, int64_t offset, float mean, float inv_std_dev, + const float* gamma, const float* beta, bool use_silu, int32_t c, int32_t ci, int32_t hw_begin, int32_t hw_end) { + // Load gamma/beta. Fetch two per thread. 
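+  // gamma and beta are always fp32, even when T is half; only the normalized result is
+  // converted back to T on the final store.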
+ float2 gamma_f2 = *reinterpret_cast(&gamma[ci]); + float2 beta_f2 = *reinterpret_cast(&beta[ci]); + for (int32_t hwi = hw_begin; hwi < hw_end; ++hwi, offset += c) { + ComputeGroupNorm(input, dst, offset, mean, inv_std_dev, gamma_f2, beta_f2, use_silu); + } +} + +template <> +__device__ void ComputeGroupNormKernel(const half* input, half* dst, int64_t offset, float mean, float inv_std_dev, + const float* gamma, const float* beta, bool use_silu, int32_t c, int32_t ci, int32_t hw_begin, int32_t hw_end) { + // Load gamma/beta. Fetch two per thread. + float2 gamma_f2 = *reinterpret_cast(&gamma[ci]); + float2 beta_f2 = *reinterpret_cast(&beta[ci]); + for (int32_t hwi = hw_begin; hwi < hw_end; ++hwi, offset += c) { + ComputeGroupNorm(input, dst, offset, mean, inv_std_dev, gamma_f2, beta_f2, use_silu); + } +} + +template +__global__ void GroupNormNHWCScaleKernel(T* dst, const T* src, const T* skip, const float* gamma, const float* beta, + const T* skip_workspace, const float* group_sum_buffer, float epsilon, + int32_t c, int32_t channels_per_block, int32_t channels_per_group, + int32_t groups, int32_t hwc, float inv_hw_channels_per_group, + int32_t hw, int32_t hw_per_block, bool use_silu) { + // The channel loaded by that thread. + int32_t ci = blockIdx.x * channels_per_block + threadIdx.x * ILP; + if (ci >= c || threadIdx.x * ILP >= channels_per_block) { + return; + } + + // The instance in the batch. + int32_t ni = blockIdx.z; + + // The group that thread works on. + int32_t gi = ci / channels_per_group; + + // Load the sum and sum of squares for the group. + float sum = 0.F, sum_sq = 0.F; + if (gi < groups) { + const int index = (2 * ni) * groups + gi; + sum = group_sum_buffer[index]; + sum_sq = group_sum_buffer[index + groups]; + } + + // Compute the mean. + float mean = sum * inv_hw_channels_per_group; + // Compute the variance. + float var = sum_sq * inv_hw_channels_per_group - (mean * mean); + // Compute the inverse of the stddev. + float inv_std_dev = rsqrtf(var + epsilon); + + int32_t hw_begin = blockIdx.y * hw_per_block; + int32_t hw_end = min(hw_begin + hw_per_block, hw); + + const T* input = (skip != nullptr) ? 
skip_workspace : src; + int64_t offset = static_cast(ni) * hwc + static_cast(hw_begin) * c + ci; + ComputeGroupNormKernel(input, dst, offset, mean, inv_std_dev, gamma, beta, use_silu, c, ci, hw_begin, hw_end); +} + +} // namespace cuda +} // namespace contrib +} // namespace onnxruntime diff --git a/onnxruntime/contrib_ops/cuda/grid_sample.cc b/onnxruntime/contrib_ops/cuda/grid_sample.cc index 4c2999c279e0..2500de39d353 100644 --- a/onnxruntime/contrib_ops/cuda/grid_sample.cc +++ b/onnxruntime/contrib_ops/cuda/grid_sample.cc @@ -9,22 +9,23 @@ namespace onnxruntime { namespace contrib { namespace cuda { -#define REGISTER_KERNEL_TYPED(T) \ +#define REGISTER_KERNEL_TYPED(T, VERSION, LAYOUT, DOMAIN) \ ONNX_OPERATOR_TYPED_KERNEL_EX( \ GridSample, \ - kMSDomain, \ - 1, \ + DOMAIN, \ + VERSION, \ T, \ kCudaExecutionProvider, \ (*KernelDefBuilder::Create()) \ .TypeConstraint("T1", DataTypeImpl::GetTensorType()) \ .TypeConstraint("T2", DataTypeImpl::GetTensorType()), \ - GridSample); + onnxruntime::contrib::cuda::GridSample); -REGISTER_KERNEL_TYPED(float) +REGISTER_KERNEL_TYPED(float, 1, LAYOUT_NCHW, kMSDomain) +REGISTER_KERNEL_TYPED(float, 16, LAYOUT_NHWC, kMSInternalNHWCDomain) -template -GridSample::GridSample(const OpKernelInfo& info) : CudaKernel(info) { +template +GridSample::GridSample(const OpKernelInfo& info) : CudaKernel(info) { std::string mode_str = info.GetAttrOrDefault("mode", "bilinear"); std::string padding_mode_str = info.GetAttrOrDefault("padding_mode", "zeros"); align_corners_ = static_cast(info.GetAttrOrDefault("align_corners", 0)); @@ -48,8 +49,8 @@ GridSample::GridSample(const OpKernelInfo& info) : CudaKernel(info) { } } -template -Status GridSample::ComputeInternal(OpKernelContext* context) const { +template +Status GridSample::ComputeInternal(OpKernelContext* context) const { const Tensor* X = context->Input(0); const auto& dims_input = X->Shape().GetDims(); const Tensor* Grid = context->Input(1); @@ -61,11 +62,13 @@ Status GridSample::ComputeInternal(OpKernelContext* context) const { ORT_ENFORCE(dims_grid[0] == dims_input[0], "Grid batch size ", dims_grid[0], " does not match input batch size ", dims_input[0]); ORT_ENFORCE(dims_grid[3] == 2, "Last dimension of grid: ", dims_grid[3], ", expect 2"); + using Ch = Channels; + TensorShapeVector dims_output(4); - dims_output[0] = dims_input[0]; - dims_output[1] = dims_input[1]; - dims_output[2] = dims_grid[1]; - dims_output[3] = dims_grid[2]; + dims_output[Ch::N] = dims_input[Ch::N]; + dims_output[Ch::C] = dims_input[Ch::C]; + dims_output[Ch::H] = dims_grid[1 /* Grid::H */]; + dims_output[Ch::W] = dims_grid[2 /* Grid::W */]; Tensor* Y = context->Output(0, dims_output); // Return early if the output tensor is going to be of size 0 if (Y->Shape().Size() == 0) { @@ -74,7 +77,7 @@ Status GridSample::ComputeInternal(OpKernelContext* context) const { typedef typename ToCudaType::MappedType CudaT; CudaT* Y_data = reinterpret_cast(Y->MutableData()); - GridSampleImpl( + GridSampleImpl( Stream(context), reinterpret_cast(X->Data()), reinterpret_cast(Grid->Data()), @@ -89,4 +92,8 @@ Status GridSample::ComputeInternal(OpKernelContext* context) const { } } // namespace cuda } // namespace contrib + +namespace cuda { +REGISTER_KERNEL_TYPED(float, 16, LAYOUT_NCHW, kOnnxDomain) +} // namespace cuda } // namespace onnxruntime diff --git a/onnxruntime/contrib_ops/cuda/grid_sample.h b/onnxruntime/contrib_ops/cuda/grid_sample.h index 08ca58c7cc45..16581bfe7748 100644 --- a/onnxruntime/contrib_ops/cuda/grid_sample.h +++ 
b/onnxruntime/contrib_ops/cuda/grid_sample.h @@ -12,7 +12,7 @@ namespace cuda { using namespace onnxruntime::cuda; -template +template class GridSample final : public CudaKernel { public: explicit GridSample(const OpKernelInfo& info); diff --git a/onnxruntime/contrib_ops/cuda/grid_sample_impl.cu b/onnxruntime/contrib_ops/cuda/grid_sample_impl.cu index 8a391eca7e86..b23da635bc83 100644 --- a/onnxruntime/contrib_ops/cuda/grid_sample_impl.cu +++ b/onnxruntime/contrib_ops/cuda/grid_sample_impl.cu @@ -50,28 +50,34 @@ __device__ T GsReflect(T x, float x_min, float x_max) { return static_cast(fx); } -template +template __device__ T PixelAtGrid(const T* input_data, int64_t bIdx, int64_t cIdx, int64_t y, int64_t x, - int64_t padding_mode, int64_t N, int64_t C, int64_t H, int64_t W, float border[4]) { + int64_t padding_mode, int64_t N, int64_t C, int64_t H, int64_t W, float border[4]) { T pixel = 0.0f; + + auto PixelOffset = [bIdx, cIdx, C, H, W](int64_t x, int64_t y) -> int64_t { + return Layout == LAYOUT_NCHW + ? (bIdx * C * H * W + cIdx * H * W + y * W + x) + : (bIdx * H * W * C + y * W * C + x * C + cIdx); + }; + if (padding_mode == 0) { // zeros if (x >= 0 && x < W && y >= 0 && y < H) { - pixel = input_data[bIdx * C * H * W + cIdx * H * W + y * W + x]; + pixel = input_data[PixelOffset(x, y)]; } - } else if (padding_mode == 1) { //border + } else if (padding_mode == 1) { // border x = max((int64_t)0, min((int64_t)W - 1, (int64_t)x)); y = max((int64_t)0, min((int64_t)H - 1, (int64_t)y)); - pixel = input_data[bIdx * C * H * W + cIdx * H * W + y * W + x]; + pixel = input_data[PixelOffset(x, y)]; } else { // Reflection - x = (int64_t) GsReflect(x, border[0], border[2]); - y = (int64_t) GsReflect(y, border[1], border[3]); - pixel = input_data[bIdx * C * H * W + cIdx * H * W + y * W + x]; + x = (int64_t)GsReflect(x, border[0], border[2]); + y = (int64_t)GsReflect(y, border[1], border[3]); + pixel = input_data[PixelOffset(x, y)]; } return pixel; } -__device__ void GsGetCubicCoeffs(float x, float coeffs[4]) -{ +__device__ void GsGetCubicCoeffs(float x, float coeffs[4]) { float cubic_alpha = -0.75f; x = abs(x); coeffs[0] = (((cubic_alpha * (x + 1) - 5 * cubic_alpha) * (x + 1) + 8 * cubic_alpha) * (x + 1) - 4 * cubic_alpha); @@ -93,7 +99,7 @@ __device__ T GsBicubicInterpolate(T p[4][4], float x, float y) { return pixel; } -template +template __global__ void _GridSampleKernel( const T* input_data, const T* grid_data, @@ -110,16 +116,32 @@ __global__ void _GridSampleKernel( { CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(idx, N * C * H_out * W_out); // extract batch index, channel index, y index, x index for current thread - int BIdx = idx / (C * H_out * W_out ); - int tmpBCnt = BIdx * (C * H_out * W_out); + int BIdx, yIdx, xIdx, cIdx; + if constexpr (Layout == LAYOUT_NCHW) { + BIdx = idx / (C * H_out * W_out); + int tmpBCnt = BIdx * (C * H_out * W_out); + + cIdx = (idx - tmpBCnt) / (H_out * W_out); + int tmpCCnt = tmpBCnt + cIdx * (H_out * W_out); - int cIdx = (idx - tmpBCnt) / (H_out * W_out); - int tmpCCnt = tmpBCnt + cIdx * (H_out * W_out); + yIdx = (idx - tmpCCnt) / W_out; + int tmpHCnt = tmpCCnt + yIdx * W_out; - int yIdx = (idx - tmpCCnt) / W_out; - int tmpHCnt = tmpCCnt + yIdx * W_out; + xIdx = (idx - tmpHCnt); + } else { + static_assert(Layout == LAYOUT_NHWC, "Unsupported layout"); - int xIdx = (idx - tmpHCnt); + BIdx = idx / (H_out * W_out * C); + int tmpBCnt = BIdx * (H_out * W_out * C); + + yIdx = (idx - tmpBCnt) / (W_out * C); + int tmpHCnt = tmpBCnt + yIdx * (W_out * C); + + xIdx = (idx - tmpHCnt) / C; 
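+      // NHWC keeps the channel innermost, so cIdx is recovered last. For reference, the flattened
+      // offsets computed by PixelOffset above are:
+      //   NCHW: ((b * C + c) * H + y) * W + x
+      //   NHWC: ((b * H + y) * W + x) * C + c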
+ int tmpWCnt = tmpHCnt + xIdx * C; + + cIdx = (idx - tmpWCnt); + } int grid_idx = BIdx * H_out * W_out + yIdx * W_out + xIdx; T grid_X = grid_data[grid_idx * 2 + 0]; @@ -147,8 +169,9 @@ __global__ void _GridSampleKernel( if (grid_x_imgSpace < x_min || grid_x_imgSpace > x_max || grid_y_imgSpace < y_min || grid_y_imgSpace > y_max) { // out of bound if (padding_mode == 1) { // border - grid_x_imgSpace = max(0.0f, min(grid_x_imgSpace, W_in - 1.0f)); - grid_y_imgSpace = max(0.0f, min(grid_y_imgSpace, H_in - 1.0f)); + // Clamping must not be done here, see #10607 + // grid_x_imgSpace = max(0.0f, min(grid_x_imgSpace, W_in - 1.0f)); + // grid_y_imgSpace = max(0.0f, min(grid_y_imgSpace, H_in - 1.0f)); } else if (padding_mode == 2) { // reflection grid_x_imgSpace = GsReflect(grid_x_imgSpace, x_min, x_max); grid_y_imgSpace = GsReflect(grid_y_imgSpace, y_min, y_max); @@ -175,10 +198,10 @@ __global__ void _GridSampleKernel( w_lb = w_b * w_l; w_rb = w_b * w_r; - T lt_v = PixelAtGrid(input_data, BIdx, cIdx, y1, x1, padding_mode, N, C, H_in, W_in, border); - T rt_v = PixelAtGrid(input_data, BIdx, cIdx, y1, x2, padding_mode, N, C, H_in, W_in, border); - T lb_v = PixelAtGrid(input_data, BIdx, cIdx, y2, x1, padding_mode, N, C, H_in, W_in, border); - T rb_v = PixelAtGrid(input_data, BIdx, cIdx, y2, x2, padding_mode, N, C, H_in, W_in, border); + T lt_v = PixelAtGrid(input_data, BIdx, cIdx, y1, x1, padding_mode, N, C, H_in, W_in, border); + T rt_v = PixelAtGrid(input_data, BIdx, cIdx, y1, x2, padding_mode, N, C, H_in, W_in, border); + T lb_v = PixelAtGrid(input_data, BIdx, cIdx, y2, x1, padding_mode, N, C, H_in, W_in, border); + T rb_v = PixelAtGrid(input_data, BIdx, cIdx, y2, x2, padding_mode, N, C, H_in, W_in, border); T interpoV = w_lt * lt_v + w_rt * rt_v + w_lb * lb_v + w_rb * rb_v; output_data[outIdx] = interpoV; return; @@ -186,7 +209,8 @@ __global__ void _GridSampleKernel( if (mode == 1) { // nearest int x_n = grid_x_imgSpace; int y_n = grid_y_imgSpace; - output_data[outIdx] = PixelAtGrid(input_data, BIdx, cIdx, y_n, x_n, padding_mode, N, C, H_in, W_in, border); + output_data[outIdx] = + PixelAtGrid(input_data, BIdx, cIdx, y_n, x_n, padding_mode, N, C, H_in, W_in, border); return; } if (mode == 2) { // bicubic @@ -195,7 +219,8 @@ __global__ void _GridSampleKernel( T p[4][4] = {}; // [H][W] for (int64_t h = 0; h < 4; h++) { for (int64_t w = 0; w < 4; w++) { - p[h][w] = PixelAtGrid(input_data, BIdx, cIdx, h + y0, w + x0, padding_mode, N, C, H_in, W_in, border); + p[h][w] = + PixelAtGrid(input_data, BIdx, cIdx, h + y0, w + x0, padding_mode, N, C, H_in, W_in, border); } } T dx = grid_x_imgSpace - x0 - 1; @@ -204,7 +229,7 @@ __global__ void _GridSampleKernel( } } -template +template void GridSampleImpl( cudaStream_t stream, const T* input_data, @@ -216,17 +241,23 @@ void GridSampleImpl( const int64_t H_out, const int64_t W_out, T* output_data) { - int blocksPerGrid = (int)(ceil(static_cast(dims[0] * dims[1] * H_out * W_out) / GridDim::maxThreadsPerBlock)); - _GridSampleKernel<<>>( - input_data, grid_data, mode, padding_mode, align_corners, dims[0], dims[1], dims[2], dims[3], H_out, W_out, output_data); + using Ch = Channels; + + int blocksPerGrid = static_cast( + ceil(static_cast(dims[Ch::N] * dims[Ch::C] * H_out * W_out) / GridDim::maxThreadsPerBlock)); + _GridSampleKernel<<>>( + input_data, grid_data, mode, padding_mode, align_corners, + dims[Ch::N], dims[Ch::C], dims[Ch::H], dims[Ch::W], + H_out, W_out, output_data); } -#define SPECIALIZED_IMPL(T) \ - template void GridSampleImpl(cudaStream_t stream, 
const T* input_data, const T* grid_data, \ - const int64_t mode, const int64_t padding_mode, const int64_t align_corners, \ - const int64_t[4], const int64_t H_out, const int64_t W_out, T* output_data); +#define SPECIALIZED_IMPL(T, IsNHWC) \ + template void GridSampleImpl(cudaStream_t stream, const T* input_data, const T* grid_data, \ + const int64_t mode, const int64_t padding_mode, const int64_t align_corners, \ + const int64_t[4], const int64_t H_out, const int64_t W_out, T* output_data); -SPECIALIZED_IMPL(float) +SPECIALIZED_IMPL(float, false) // NCHW +SPECIALIZED_IMPL(float, true) // NHWC } // namespace cuda } // namespace contrib diff --git a/onnxruntime/contrib_ops/cuda/grid_sample_impl.h b/onnxruntime/contrib_ops/cuda/grid_sample_impl.h index 6df86ce16190..62cd66a48fa8 100644 --- a/onnxruntime/contrib_ops/cuda/grid_sample_impl.h +++ b/onnxruntime/contrib_ops/cuda/grid_sample_impl.h @@ -8,7 +8,7 @@ namespace onnxruntime { namespace contrib { namespace cuda { -template +template void GridSampleImpl( cudaStream_t stream, const T* input_data, diff --git a/onnxruntime/contrib_ops/cuda/inverse.cc b/onnxruntime/contrib_ops/cuda/inverse.cc index 81e161e60642..9075dda26f86 100644 --- a/onnxruntime/contrib_ops/cuda/inverse.cc +++ b/onnxruntime/contrib_ops/cuda/inverse.cc @@ -78,9 +78,9 @@ struct Inverse::ComputeImpl { cudaStream_t stream = ort_stream ? static_cast(ort_stream->GetHandle()) : nullptr; // Make a copy of the input which will serve as a workspace as well. - if (std::is_same::value || std::is_same::value) { + if constexpr (std::is_same::value || std::is_same::value) { IAllocatorUniquePtr input_workspace = inst->GetScratchBuffer(input_count, ort_stream); - if (std::is_same::value) { + if constexpr (std::is_same::value) { // Convert from MLFloat16(half) to float Impl_Cast(stream, reinterpret_cast(input.Data()), input_workspace.get(), input_count); } else { @@ -96,7 +96,7 @@ struct Inverse::ComputeImpl { // Need to compute ptrs for output buffers // Output for MLFloat IAllocatorUniquePtr output_ptrs = inst->GetScratchBuffer(n_batches, ort_stream); - if (std::is_same::value) { + if constexpr (std::is_same::value) { IAllocatorUniquePtr ml_float_output = inst->GetScratchBuffer(input_count, ort_stream); ORT_RETURN_IF_ERROR(ComputeMatrixOffsets(stream, ml_float_output.get(), num_batches, rows, output_ptrs)); // Do the inverse @@ -112,7 +112,7 @@ struct Inverse::ComputeImpl { ORT_RETURN_IF_ERROR(CheckForSingularity(stream, info, info_cpu, num_batches)); // We are done here } - } else if (std::is_same::value) { + } else if constexpr (std::is_same::value) { IAllocatorUniquePtr input_workspace = inst->GetScratchBuffer(static_cast(input_count), ort_stream); CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(input_workspace.get(), input.Data(), sizeof(double) * input_count, cudaMemcpyDeviceToDevice, stream)); diff --git a/onnxruntime/contrib_ops/cuda/math/complex_mul_impl.cu b/onnxruntime/contrib_ops/cuda/math/complex_mul_impl.cu index ca94477114ee..47a64502b348 100644 --- a/onnxruntime/contrib_ops/cuda/math/complex_mul_impl.cu +++ b/onnxruntime/contrib_ops/cuda/math/complex_mul_impl.cu @@ -97,8 +97,8 @@ void ComplexMul_Impl( const TArray* rhs_padded_strides, const T* rhs_data, const TArray* fdm_output_strides, - const onnxruntime::cuda::fast_divmod& fdm_H, - const onnxruntime::cuda::fast_divmod& fdm_C, + const onnxruntime::cuda::fast_divmod& /*fdm_H*/, + const onnxruntime::cuda::fast_divmod& /*fdm_C*/, T* output_data, int64_t count, int64_t lhs_size, diff --git 
a/onnxruntime/contrib_ops/cuda/math/gemm_float8.cu b/onnxruntime/contrib_ops/cuda/math/gemm_float8.cu index 064b6dd39243..28ab27ee33d1 100644 --- a/onnxruntime/contrib_ops/cuda/math/gemm_float8.cu +++ b/onnxruntime/contrib_ops/cuda/math/gemm_float8.cu @@ -174,7 +174,7 @@ Status GemmFloat8::ComputeGemm( int32_t dtype_A, int32_t dtype_B, int32_t dtype_C, int32_t dtype_Y, const TensorShape& shape_A, const TensorShape& shape_B, - const TensorShape& shape_C, const TensorShape& shape_Y, + const TensorShape& shape_C, const TensorShape& /*shape_Y*/, bool trans_A, bool trans_B, const void* p_input_a, const void* p_input_b, const void* p_input_c, const void* p_scale_a, const void* p_scale_b, const void* p_scale_y, void* p_output_y, int M, int N, int K, int lda, diff --git a/onnxruntime/contrib_ops/cuda/moe/cutlass_extensions/arch/mma.h b/onnxruntime/contrib_ops/cuda/moe/cutlass_extensions/arch/mma.h new file mode 100644 index 000000000000..07c38c58e446 --- /dev/null +++ b/onnxruntime/contrib_ops/cuda/moe/cutlass_extensions/arch/mma.h @@ -0,0 +1,110 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Templates exposing architecture support for multiply-add operations +*/ + +#pragma once +#include "contrib_ops/cuda/moe/cutlass_extensions/weight_only_quant_op.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace arch { + +// Tag which triggers MMA which will trigger +struct OpMultiplyAddDequantizeInterleavedBToA; + +/* + Below we have extra tags to signal what kind of dequantization we want to do + (per col, scale only fine grained, finegrained with zero). This still lets us + the existing template infrastructure (incl. 
that in CUTLASS). However, we + split out the template below into OpMultiplyAddDequantizeInterleavedBToA along + with the quantization op before instantiating the GEMM pieces. + + Note that this is somewhat of a hack, but it SIGNIFICANTLY reduces the amount of + code we need to duplicate. + */ +struct OpMultiplyAddDequantizeInterleavedBToA_percol_scale; +struct OpMultiplyAddDequantizeInterleavedBToA_fine_scale; +struct OpMultiplyAddDequantizeInterleavedBToA_fine_scalebias; + +// The default just forwards the original operator +template +struct TagOperator { + using TaggedOperator = MmaOp; +}; + +// Specializations below attach more information to the operator +template <> +struct TagOperator { + using TaggedOperator = OpMultiplyAddDequantizeInterleavedBToA_percol_scale; +}; + +template <> +struct TagOperator { + using TaggedOperator = OpMultiplyAddDequantizeInterleavedBToA_fine_scale; +}; + +template <> +struct TagOperator { + using TaggedOperator = OpMultiplyAddDequantizeInterleavedBToA_fine_scalebias; +}; + +// Here we instantiate some structs to "detag" the tagged operator. It splits it back to the original +// operator + the extra information. If no extra info was tagged, the dequant op per column scaling +// as a default. +template +struct DetagOperator { + using Operator = TaggedMmaOp; + static constexpr WeightOnlyQuantOp QuantOp = WeightOnlyQuantOp::PER_COLUMN_SCALE_ONLY; +}; + +template <> +struct DetagOperator { + using Operator = OpMultiplyAddDequantizeInterleavedBToA; + static constexpr WeightOnlyQuantOp QuantOp = WeightOnlyQuantOp::PER_COLUMN_SCALE_ONLY; +}; + +template <> +struct DetagOperator { + using Operator = OpMultiplyAddDequantizeInterleavedBToA; + static constexpr WeightOnlyQuantOp QuantOp = WeightOnlyQuantOp::FINEGRAINED_SCALE_ONLY; +}; + +template <> +struct DetagOperator { + using Operator = OpMultiplyAddDequantizeInterleavedBToA; + static constexpr WeightOnlyQuantOp QuantOp = WeightOnlyQuantOp::FINEGRAINED_SCALE_AND_ZEROS; +}; + +} // namespace arch +} // namespace cutlass diff --git a/onnxruntime/contrib_ops/cuda/moe/ft_moe/compute_occupancy.h b/onnxruntime/contrib_ops/cuda/moe/cutlass_extensions/compute_occupancy.h similarity index 62% rename from onnxruntime/contrib_ops/cuda/moe/ft_moe/compute_occupancy.h rename to onnxruntime/contrib_ops/cuda/moe/cutlass_extensions/compute_occupancy.h index 9b97690fe70f..99cbe4a66049 100644 --- a/onnxruntime/contrib_ops/cuda/moe/ft_moe/compute_occupancy.h +++ b/onnxruntime/contrib_ops/cuda/moe/cutlass_extensions/compute_occupancy.h @@ -13,9 +13,6 @@ * See the License for the specific language governing permissions and * limitations under the License. */ - -#ifdef USE_CUTLASS - #pragma once #include @@ -29,19 +26,22 @@ namespace ort_fastertransformer { template inline int compute_occupancy_for_kernel() { - int smem_size = int(sizeof(typename GemmKernel::SharedStorage)); + int smem_size = static_cast(sizeof(typename GemmKernel::SharedStorage)); if (smem_size > (48 << 10)) { - cudaError_t status = - cudaFuncSetAttribute(cutlass::Kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size); - if (status == cudaError::cudaErrorInvalidValue) { - // Clear the error bit since we can ignore this. - // This should mean that smem_size > cudaDevAttrMaxSharedMemoryPerBlockOptin. In that case, we return an - // occupancy of 0. This will cause the heuristic to ignore this configuration. 
- status = cudaGetLastError(); + cudaFuncAttributes attr; + int device = 0; + int max_smem_per_block = 0; + CUDA_CALL_THROW(cudaGetDevice(&device)); + CUDA_CALL_THROW(cudaDeviceGetAttribute(&max_smem_per_block, cudaDevAttrMaxSharedMemoryPerBlockOptin, device)); + CUDA_CALL_THROW(cudaFuncGetAttributes(&attr, cutlass::Kernel)); + if (smem_size + attr.sharedSizeBytes >= static_cast(max_smem_per_block)) { + // This should mean that + // cudaFuncSetAttribute(cutlass::Kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size) + // wouldn't work. In that case, we return an occupancy of 0. This will cause the heuristic to ignore this + // configuration. return 0; } - CUDA_CALL_THROW(status); } int max_active_blocks = -1; @@ -52,5 +52,3 @@ inline int compute_occupancy_for_kernel() { } } // namespace ort_fastertransformer - -#endif diff --git a/onnxruntime/contrib_ops/cuda/moe/ft_moe/gemm_moe_problem_visitor.h b/onnxruntime/contrib_ops/cuda/moe/cutlass_extensions/epilogue/thread/fused_activations.h similarity index 58% rename from onnxruntime/contrib_ops/cuda/moe/ft_moe/gemm_moe_problem_visitor.h rename to onnxruntime/contrib_ops/cuda/moe/cutlass_extensions/epilogue/thread/fused_activations.h index 617f9992d180..da8cb6d294ef 100644 --- a/onnxruntime/contrib_ops/cuda/moe/ft_moe/gemm_moe_problem_visitor.h +++ b/onnxruntime/contrib_ops/cuda/moe/cutlass_extensions/epilogue/thread/fused_activations.h @@ -28,56 +28,68 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * **************************************************************************************************/ - -#ifdef USE_CUTLASS - /*! \file - \brief Scheduler for grouped GEMM + \brief Functor performing linear combination with a maximum operation used by epilogues. */ #pragma once +#include "cutlass/array.h" #include "cutlass/cutlass.h" -#include "cutlass/gemm/gemm.h" -#include "cutlass/gemm/kernel/gemm_grouped_problem_visitor.h" -#include "cutlass/matrix_coord.h" - -#include "moe_problem_visitor.h" +#include "cutlass/epilogue/thread/activation.h" +#include "cutlass/epilogue/thread/linear_combination_generic.h" +#include "cutlass/epilogue/thread/scale_type.h" +#include "cutlass/functional.h" +#include "cutlass/half.h" +#include "cutlass/numeric_conversion.h" +#include "cutlass/numeric_types.h" ///////////////////////////////////////////////////////////////////////////////////////////////// namespace cutlass { -namespace gemm { -namespace kernel { +namespace epilogue { +namespace thread { -/// Visitor class to abstract away the algorithm for iterating over tiles -template -struct GemmMoeProblemVisitor - : public MoeProblemVisitor, ThreadblockShape, - GroupScheduleMode_, PrefetchTileCount, ThreadCount> { - static bool const kTransposed = Transposed; +///////////////////////////////////////////////////////////////////////////////////////////////// - using ProblemSizeHelper = detail::GemmGroupedProblemSizeHelper; - using Base = - MoeProblemVisitor; - using Params = typename Base::Params; - using SharedStorage = typename Base::SharedStorage; +__forceinline__ __device__ float copysignf_pos(float a, float b) { + float r; + r = __int_as_float(__float_as_int(a) | (__float_as_int(b) & 0x80000000)); + return r; +} - // - // Methods - // - CUTLASS_DEVICE - GemmMoeProblemVisitor(Params const& params_, SharedStorage& shared_storage_, int32_t block_idx) - : Base(params_, shared_storage_, block_idx) {} -}; +__forceinline__ __device__ float tanh_opt(float x) { +#if (__CUDACC_VER_MAJOR__ < 11) || (__CUDA_ARCH__ < 750) + float const 
exp_val = -1.f * fabs(2 * x); + return copysignf_pos((1.0f - __expf(exp_val)) / (__expf(exp_val) + 1.0f), x); +#else + return fast_tanh(x); +#endif +} ///////////////////////////////////////////////////////////////////////////////////////////////// +template <> +struct GELU_taylor { + static bool const kIsHeavy = true; + + CUTLASS_DEVICE + float operator()(float const& z) const { + float k0 = static_cast(0.7978845608028654); + float k1 = static_cast(0.044715); + + return static_cast( + cutlass::constants::half() * z * + (cutlass::constants::one() + tanh_opt(k0 * z * (cutlass::constants::one() + k1 * z * z)))); + } + + using Params = LinearCombinationGenericParams; + + CUTLASS_DEVICE + float operator()(float const& scalar, Params const& params_) const { return this->operator()(scalar); } +}; -} // namespace kernel -} // namespace gemm +} // namespace thread +} // namespace epilogue } // namespace cutlass ///////////////////////////////////////////////////////////////////////////////////////////////// - -#endif diff --git a/onnxruntime/contrib_ops/cuda/moe/cutlass_extensions/epilogue/threadblock/epilogue_per_row_per_col_scale.h b/onnxruntime/contrib_ops/cuda/moe/cutlass_extensions/epilogue/threadblock/epilogue_per_row_per_col_scale.h new file mode 100644 index 000000000000..affd1d83a35d --- /dev/null +++ b/onnxruntime/contrib_ops/cuda/moe/cutlass_extensions/epilogue/threadblock/epilogue_per_row_per_col_scale.h @@ -0,0 +1,306 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Epilogue visitor for threadblock scoped INT8 GEMMs that uses one scaling factor per row, and one per column. 
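For reference, the GELU_taylor<float> specialization above evaluates the standard tanh approximation of GELU, 0.5 * z * (1 + tanh(k0 * z * (1 + k1 * z^2))) with k0 = sqrt(2/pi) and k1 = 0.044715, with tanh_opt falling back to an __expf-based tanh on older architectures. A minimal host-side sketch of the same formula, handy for checking the constants against the exact erf-based GELU (standalone C++, illustrative only; gelu_tanh_approx is a made-up name, not part of this diff):

#include <cmath>
#include <cstdio>

// Tanh approximation matching GELU_taylor<float> above (k0 = sqrt(2 / pi)).
static float gelu_tanh_approx(float z) {
  const float k0 = 0.7978845608028654f;
  const float k1 = 0.044715f;
  return 0.5f * z * (1.0f + std::tanh(k0 * z * (1.0f + k1 * z * z)));
}

int main() {
  // Exact GELU for comparison: 0.5 * z * (1 + erf(z / sqrt(2))).
  for (float z : {-2.0f, -0.5f, 0.0f, 0.5f, 2.0f}) {
    const float exact = 0.5f * z * (1.0f + std::erf(z / std::sqrt(2.0f)));
    std::printf("z=%+.2f approx=%+.6f exact=%+.6f\n", z, gelu_tanh_approx(z), exact);
  }
  return 0;
}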
+ + original file: 3rdparty/cutlass/include/cutlass/epilogue/threadblock/epilogue_visitor_with_softmax.h + +*/ + +#pragma once + +///////////////////////////////////////////////////////////////////////////////////////////////// + +#include "cutlass/arch/memory.h" +#include "cutlass/arch/memory_sm75.h" +#include "cutlass/cutlass.h" +#include "cutlass/fast_math.h" +#include "cutlass/numeric_conversion.h" +#include "tensorrt_llm/common/quantization.h" + +namespace tk = tensorrt_llm::common; + +namespace cutlass { +namespace epilogue { +namespace threadblock { + +template +class EpilogueVisitorPerRowPerCol { + public: + using ThreadblockShape = ThreadblockShape_; + static int const kThreadCount = ThreadCount; + + using ScaleTileIterator = ScaleTileIterator_; + using OutputTileIterator = OutputTileIterator_; + using ElementwiseFunctor = ElementwiseFunctor_; + + static int const kIterations = OutputTileIterator::kIterations; + static int const kElementsPerAccess = OutputTileIterator::kElementsPerAccess; + + using ElementOutput = typename OutputTileIterator::Element; + using LayoutOutput = cutlass::layout::RowMajor; + using ElementAccumulator = ElementAccumulator_; + + using AlphaScaleElementType = typename ScaleTileIterator::Element; + + using ElementCompute = ElementCompute_; + using AccumulatorFragment = Array; + using ComputeFragment = Array; + using OutputVector = Array; + + static int const kThreadsPerRow = OutputTileIterator::ThreadMap::Detail::kAccessWidth; + static bool const kHasMultiStepsInRow = (OutputTileIterator::ThreadMap::Iterations::kColumn > 1); + + /// Argument structure + struct Arguments { + typename ElementwiseFunctor::Params elementwise; + int64_t batch_stride_alpha; + int64_t batch_stride_C; + int64_t batch_stride_D; + + // + // Methods + // + Arguments() : batch_stride_alpha(0), batch_stride_C(0), batch_stride_D(0) {} + + explicit Arguments(typename ElementwiseFunctor::Params elementwise_) + : elementwise(elementwise_), batch_stride_alpha(0), batch_stride_C(0), batch_stride_D(0) {} + + Arguments(typename ElementwiseFunctor::Params elementwise_, int64_t batch_stride_alpha_, int64_t batch_stride_C_, + int64_t batch_stride_D_) + : elementwise(elementwise_), + batch_stride_alpha(batch_stride_alpha_), + batch_stride_C(batch_stride_C_), + batch_stride_D(batch_stride_D_) {} + }; + + struct Params { + typename ElementwiseFunctor::Params elementwise; + int64_t batch_stride_alpha; + int64_t batch_stride_C; + int64_t batch_stride_D; + + // + // Methods + // + CUTLASS_HOST_DEVICE + Params() {} + + CUTLASS_HOST_DEVICE + explicit Params(Arguments const& args) + : elementwise(args.elementwise), + batch_stride_alpha(args.batch_stride_alpha), + batch_stride_C(args.batch_stride_C), + batch_stride_D(args.batch_stride_D) {} + }; + + /// Shared storage + struct SharedStorage {}; + + private: + Params const& params_; + SharedStorage& shared_storage_; + MatrixCoord extent_; + MatrixCoord extent_real_; + ElementwiseFunctor elementwise_; + + bool const per_token_quant_; + bool const per_channel_quant_; + + AlphaScaleElementType* ptr_alpha_row_; + AlphaScaleElementType* ptr_alpha_col_; + ScaleTileIterator iterator_alpha_col_; + OutputTileIterator iterator_C_; + OutputTileIterator iterator_D_; + + AlphaScaleElementType element_alpha_row_ = 1.0f; + AlphaScaleElementType element_alpha_col_ = 1.0f; + typename ScaleTileIterator::Fragment fragment_alpha_col_; + typename OutputTileIterator::Fragment fragment_C_; + typename OutputTileIterator::Fragment fragment_D_; + + ElementAccumulator beta_; + + int 
column_offset_; + + MatrixCoord thread_offset_; + + public: + CUTLASS_DEVICE + EpilogueVisitorPerRowPerCol(Params const& params, SharedStorage& shared_storage, + cutlass::MatrixCoord const& problem_size, int thread_idx, int warp_idx, int lane_idx, + typename ScaleTileIterator::Params params_alpha_col, + typename OutputTileIterator::Params params_C, + typename OutputTileIterator::Params params_D, tk::QuantMode quant_option, + AlphaScaleElementType* ptr_alpha_row, AlphaScaleElementType* ptr_alpha_col, + typename OutputTileIterator::Element* ptr_C, typename OutputTileIterator::Element* ptr_D, + cutlass::MatrixCoord const& threadblock_offset = cutlass::MatrixCoord(0, 0), + int column_offset = 0, + cutlass::MatrixCoord const& problem_size_real = cutlass::MatrixCoord(0, 0)) + : params_(params), + shared_storage_(shared_storage), + extent_(problem_size), + elementwise_(params.elementwise), + per_token_quant_(quant_option.hasPerTokenScaling()), + per_channel_quant_(quant_option.hasPerChannelScaling()), + ptr_alpha_row_(ptr_alpha_row), + ptr_alpha_col_(ptr_alpha_col), + iterator_alpha_col_(params_alpha_col, ptr_alpha_col, problem_size, thread_idx, threadblock_offset), + iterator_C_(params_C, ptr_C, problem_size, thread_idx, threadblock_offset), + iterator_D_(params_D, ptr_D, problem_size, thread_idx, threadblock_offset), + extent_real_(problem_size_real) { + beta_ = (params.elementwise.beta_ptr ? *params.elementwise.beta_ptr : params.elementwise.beta); + + if (beta_ == ElementAccumulator()) { + iterator_C_.clear_mask(); + } + + if (!per_channel_quant_ && (ptr_alpha_col_ != nullptr)) { + element_alpha_col_ = *ptr_alpha_col_; + } + + if (!per_token_quant_ && (ptr_alpha_row_ != nullptr)) { + element_alpha_row_ = *ptr_alpha_row_; + } + } + + /// Helper to indicate split-K behavior + CUTLASS_DEVICE + void set_k_partition(int split_k_index, ///< Index of this threadblock within split-K partitioned scheme + int split_k_slices) { ///< Total number of split-K slices + } + + /// Called to set the batch index + CUTLASS_DEVICE + void set_batch_index(int batch_idx) { + iterator_alpha_col_.add_pointer_offset(batch_idx * params_.batch_stride_alpha); + iterator_C_.add_pointer_offset(batch_idx * params_.batch_stride_C); + iterator_D_.add_pointer_offset(batch_idx * params_.batch_stride_D); + } + + /// Called at the start of the epilogue just before iterating over accumulator slices + CUTLASS_DEVICE + void begin_epilogue() { + if (per_channel_quant_) { + iterator_alpha_col_.load(fragment_alpha_col_); + } + } + + /// Called at the start of one step before starting accumulator exchange + CUTLASS_DEVICE + void begin_step(int step_idx) { + fragment_D_.clear(); + fragment_C_.clear(); + + if (elementwise_.kScale != cutlass::epilogue::thread::ScaleType::OnlyAlphaScaling) { + iterator_C_.load(fragment_C_); + ++iterator_C_; + } + } + + /// Called at the start of a row + CUTLASS_DEVICE + void begin_row(int row_idx) { + // load alpha_row in begin_step only when per token(row) scaling is used + if (per_token_quant_) { + int thread_offset_row = + iterator_D_.thread_start_row() + OutputTileIterator::ThreadMap::iteration_offset(row_idx).row(); + + arch::global_load( + element_alpha_row_, ptr_alpha_row_ + thread_offset_row, thread_offset_row < extent_.row()); + } + } + + /// Called after accumulators have been exchanged for each accumulator vector + CUTLASS_DEVICE + void visit(int iter_idx, int row_idx, int column_idx, int frag_idx, AccumulatorFragment const& accum) { + NumericArrayConverter source_converter; + + ComputeFragment 
result = source_converter(accum); + if (per_channel_quant_) { + ComputeFragment alpha_col = reinterpret_cast(&fragment_alpha_col_)[column_idx]; + result = per_token_channel_scale_accumulator_(result, alpha_col, element_alpha_row_); + } else { + result = per_token_scale_accumulator_(result, element_alpha_col_, element_alpha_row_); + } + + // Convert to the output + NumericArrayConverter output_converter; + OutputVector& output = reinterpret_cast(&fragment_D_)[frag_idx]; + output = output_converter(result); + } + + /// Called at the end of a row + CUTLASS_DEVICE + void end_row(int row_idx) {} + + /// Called after all accumulator elements have been visited + CUTLASS_DEVICE + void end_step(int step_idx) { + iterator_D_.store(fragment_D_); + ++iterator_D_; + } + + /// Called after all steps have been completed + CUTLASS_DEVICE + void end_epilogue() {} + + private: + CUTLASS_DEVICE + ComputeFragment per_token_channel_scale_accumulator_(ComputeFragment const& accum, ComputeFragment const& scale_col, + AlphaScaleElementType const& scale_row) { + ComputeFragment result; + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < ComputeFragment::kElements; ++i) { + result[i] = accum[i] * (scale_col[i] * scale_row); + } + + return result; + } + + CUTLASS_DEVICE + ComputeFragment per_token_scale_accumulator_(ComputeFragment const& accum, AlphaScaleElementType const& scale_col, + AlphaScaleElementType const& scale_row) { + ComputeFragment result; + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < ComputeFragment::kElements; ++i) { + result[i] = accum[i] * (scale_col * scale_row); + } + + return result; + } +}; + +} // namespace threadblock +} // namespace epilogue +} // namespace cutlass diff --git a/onnxruntime/contrib_ops/cuda/moe/cutlass_extensions/epilogue/threadblock/epilogue_tensor_op_int32.h b/onnxruntime/contrib_ops/cuda/moe/cutlass_extensions/epilogue/threadblock/epilogue_tensor_op_int32.h new file mode 100644 index 000000000000..40f126d56616 --- /dev/null +++ b/onnxruntime/contrib_ops/cuda/moe/cutlass_extensions/epilogue/threadblock/epilogue_tensor_op_int32.h @@ -0,0 +1,247 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Epilogue for threadblock scoped GEMMs using Tensor Ops. + + The epilogue rearranges the result of a matrix product through shared memory to match canonical + tensor layouts in global memory. Epilogues support conversion and reduction operations. + + original file: 3rdparty/cutlass/include/cutlass/epilogue/threadblock/default_epilogue_tensor_op.h + +*/ + +#pragma once + +#include "cutlass/array.h" +#include "cutlass/cutlass.h" +#include "cutlass/numeric_types.h" + +#include "cutlass/platform/platform.h" + +#include "cutlass/gemm/gemm.h" + +#include "cutlass/epilogue/thread/linear_combination.h" +#include "cutlass/epilogue/thread/linear_combination_clamp.h" +#include "cutlass/epilogue/thread/linear_combination_gelu.h" +#include "cutlass/epilogue/thread/linear_combination_hardswish.h" +#include "cutlass/epilogue/thread/linear_combination_planar_complex.h" +#include "cutlass/epilogue/thread/linear_combination_relu.h" +#include "cutlass/epilogue/thread/linear_combination_relu0.h" +#include "cutlass/epilogue/thread/linear_combination_sigmoid.h" + +#include "cutlass/epilogue/thread/conversion_op.h" +#include "cutlass/epilogue/thread/reduction_op.h" + +#include "cutlass/transform/threadblock/regular_tile_iterator_pitch_linear.h" + +#include "cutlass/epilogue/threadblock/default_thread_map_tensor_op.h" +#include "cutlass/epilogue/threadblock/predicated_tile_iterator.h" +#include "cutlass/epilogue/threadblock/predicated_tile_iterator_affine.h" +#include "cutlass/epilogue/threadblock/predicated_tile_iterator_strided_dgrad.h" +#include "cutlass/epilogue/threadblock/shared_load_iterator.h" +#include "cutlass/epilogue/threadblock/shared_load_iterator_mixed.h" +#include "cutlass/epilogue/warp/fragment_iterator_complex_tensor_op.h" +#include "cutlass/epilogue/warp/fragment_iterator_tensor_op.h" +#include "cutlass/epilogue/warp/tile_iterator_tensor_op.h" +#include "cutlass/epilogue/warp/tile_iterator_tensor_op_mixed.h" + +#include "cutlass/epilogue/threadblock/epilogue.h" +#include "cutlass/epilogue/threadblock/interleaved_epilogue.h" + +#include "cutlass/layout/permute.h" + +//////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace epilogue { +namespace threadblock { + +//////////////////////////////////////////////////////////////////////////////// + +namespace detail { + +/// Partial specialization for bfloat16_t <= int32_t x 8 epilogues avoids shared memory bank conflicts. 
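+/// The SharedLoadIteratorMixed defined below staggers its kLoadsPerAccess pointers across 128-byte
+/// phases ((col_idx * sizeof(LoadType) / 128) % kLoadsPerAccess) so that the int32_t accumulator
+/// tile can be read back from shared memory without bank conflicts.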
+template +struct DefaultIteratorsTensorOp { + using WarpTileIterator = + cutlass::epilogue::warp::TileIteratorTensorOpMixed; + + using SharedLoadIterator = cutlass::epilogue::threadblock::SharedLoadIteratorMixed; + + static int const kFragmentsPerIteration = 2; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace detail + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Tile iterator used to load output tile from shared memory in epilogue. +/// +/// Satisfies: ReadableTileIterator +/// +template +class SharedLoadIteratorMixed { + public: + using ThreadMap = ThreadMap_; + using Shape = typename ThreadMap::Shape; + + using Element = int32_t; + + using Layout = layout::RowMajor; + using TensorRef = TensorRef; + using ConstTensorRef = typename TensorRef::ConstTensorRef; + + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + using TensorCoord = MatrixCoord; + + static int const kElementsPerAccess = ThreadMap::kElementsPerAccess; + + static int const kAlignment = ThreadMap::kElementsPerAccess * sizeof_bits::value / 8; + + static int const kThreads = ThreadMap::kThreads; + + /// Fragment object + using Fragment = + Array; + + /// Memory access size + using AccessType = AlignedArray; + + /// Vector type used for SMEM loads + using LoadType = AlignedArray::value, ThreadMap::kElementsPerAccess), + const_min(16, kAlignment)>; + + static int const kLoadsPerAccess = AccessType::kElements / LoadType::kElements; + + private: + // + // Data members + // + + /// Byte-level pointer + LoadType const* pointers_[kLoadsPerAccess]; + + /// Stride along adjacent rows in units of LoadType + int stride_; + + public: + // + // Methods + // + + /// Constructor + CUTLASS_DEVICE + SharedLoadIteratorMixed(TensorRef ref, int thread_idx) : stride_((ref.stride(0) / LoadType::kElements)) { + TensorCoord thread_offset = ThreadMap::initial_offset(thread_idx); + + // Initialize pointers + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < kLoadsPerAccess; ++i) { + pointers_[i] = reinterpret_cast(ref.data()); + + int col_idx = (thread_offset.column() / kElementsPerAccess) * kLoadsPerAccess; + int bank_offset = (col_idx * static_cast(sizeof(LoadType)) / 128) % kLoadsPerAccess; + + col_idx += (bank_offset + i) % kLoadsPerAccess; + + pointers_[i] += thread_offset.row() * stride_ + col_idx; + } + } + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < kLoadsPerAccess; ++i) { + pointers_[i] += pointer_offset / LoadType::kElements; + } + } + + CUTLASS_DEVICE + void add_tile_offset(TensorCoord const& offset) { + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < kLoadsPerAccess; ++i) { + pointers_[i] += offset.row() * Shape::kRow * stride_ + offset.column() * Shape::kColumn / LoadType::kElements; + } + } + + /// Loads a fragment from memory + CUTLASS_DEVICE + void load_with_pointer_offset(Fragment& frag, Index pointer_offset) const { + CUTLASS_PRAGMA_UNROLL + for (int cluster = 0; cluster < ThreadMap::Iterations::kCluster; ++cluster) { + CUTLASS_PRAGMA_UNROLL + for (int group = 0; group < ThreadMap::Iterations::kGroup; ++group) { + CUTLASS_PRAGMA_UNROLL + for (int row = 0; row < ThreadMap::Iterations::kRow; ++row) { + int row_ptr_offset = row * ThreadMap::Delta::kRow * stride_ + group * ThreadMap::Delta::kGroup * stride_ + + cluster * ThreadMap::Delta::kCluster * stride_ 
+ pointer_offset / LoadType::kElements; + + int frag_row_idx = (row + ThreadMap::Iterations::kRow * (group + ThreadMap::Iterations::kGroup * cluster)); + + LoadType* frag_ptr = reinterpret_cast(&frag); + + CUTLASS_PRAGMA_UNROLL + for (int column = 0; column < ThreadMap::Iterations::kColumn; ++column) { + int frag_idx = frag_row_idx * ThreadMap::Iterations::kColumn + column; + + CUTLASS_PRAGMA_UNROLL + for (int v = 0; v < kLoadsPerAccess; ++v) { + int vector_idx = (column * ThreadMap::Delta::kColumn / kElementsPerAccess * kLoadsPerAccess); + + LoadType const* memory_pointer = pointers_[v] + row_ptr_offset; + + frag_ptr[frag_idx * kLoadsPerAccess + v] = memory_pointer[vector_idx]; + } + } + } + } + } + } + + /// Loads a fragment + CUTLASS_DEVICE + void load(Fragment& frag) const { load_with_pointer_offset(frag, 0); } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace epilogue +} // namespace cutlass + +//////////////////////////////////////////////////////////////////////////////// diff --git a/onnxruntime/contrib_ops/cuda/moe/ft_moe/epilogue_helpers.h b/onnxruntime/contrib_ops/cuda/moe/cutlass_extensions/epilogue_helpers.h similarity index 55% rename from onnxruntime/contrib_ops/cuda/moe/ft_moe/epilogue_helpers.h rename to onnxruntime/contrib_ops/cuda/moe/cutlass_extensions/epilogue_helpers.h index f41c42440f19..b784646c31f8 100644 --- a/onnxruntime/contrib_ops/cuda/moe/ft_moe/epilogue_helpers.h +++ b/onnxruntime/contrib_ops/cuda/moe/cutlass_extensions/epilogue_helpers.h @@ -1,11 +1,12 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
* You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, @@ -22,65 +23,14 @@ * */ -#ifdef USE_CUTLASS - #pragma once -#include "cutlass/array.h" -#include "cutlass/cutlass.h" -#include "cutlass/epilogue/thread/activation.h" -#include "cutlass/epilogue/thread/scale_type.h" -#include "cutlass/functional.h" -#include "cutlass/half.h" -#include "cutlass/numeric_conversion.h" -#include "cutlass/numeric_types.h" +#include "contrib_ops/cuda/moe/cutlass_extensions/epilogue/thread/fused_activations.h" #include "cutlass/epilogue/thread/linear_combination.h" #include "cutlass/epilogue/thread/linear_combination_generic.h" #include "cutlass/epilogue/thread/linear_combination_relu.h" #include "cutlass/epilogue/thread/linear_combination_silu.h" -namespace cutlass { -namespace epilogue { -namespace thread { - -__forceinline__ __device__ float copysignf_pos(float a, float b) { - float r; - r = __int_as_float(__float_as_int(a) | (__float_as_int(b) & 0x80000000)); - return r; -} - -__forceinline__ __device__ float tanh_opt(float x) { -#if (__CUDACC_VER_MAJOR__ < 11) || (__CUDA_ARCH__ < 750) - const float exp_val = -1.f * fabs(2 * x); - return copysignf_pos((1.0f - __expf(exp_val)) / (__expf(exp_val) + 1.0f), x); -#else - return fast_tanh(x); -#endif -} - -template <> -struct GELU_taylor { - static const bool kIsHeavy = true; - CUTLASS_DEVICE - float operator()(float const& z) const { - float k0 = float(0.7978845608028654); - float k1 = float(0.044715); - - return float( - cutlass::constants::half() * z * - (cutlass::constants::one() + tanh_opt(k0 * z * (cutlass::constants::one() + k1 * z * z)))); - } - - using Params = LinearCombinationGenericParams; - - CUTLASS_DEVICE - float operator()(float const& scalar, Params const& params_) const { return this->operator()(scalar); } -}; - -} // namespace thread -} // namespace epilogue -} // namespace cutlass - namespace ort_fastertransformer { struct EpilogueOpBiasSilu {}; @@ -89,49 +39,71 @@ struct EpilogueOpBiasReLU {}; struct EpilogueOpBiasFtGelu {}; +struct EpilogueOpDefaultSilu {}; + +struct EpilogueOpDefaultReLU {}; + +struct EpilogueOpDefaultFtGelu {}; + struct EpilogueOpBias {}; -struct EpilogueOpNoBias {}; +struct EpilogueOpDefault {}; template struct Epilogue {}; +constexpr auto BiasScaleMode = cutlass::epilogue::thread::ScaleType::NoBetaScaling; + template struct Epilogue { using Op = cutlass::epilogue::thread::LinearCombinationSilu; + ElementAccumulator, BiasScaleMode>; }; template struct Epilogue { using Op = cutlass::epilogue::thread::LinearCombinationRelu; + ElementAccumulator, BiasScaleMode>; }; template struct Epilogue { using Op = cutlass::epilogue::thread::LinearCombinationGeneric< cutlass::epilogue::thread::GELU_taylor, ElementType, ElementsPerVectorAccess, ElementAccumulator, - ElementAccumulator, cutlass::epilogue::thread::ScaleType::NoBetaScaling, - cutlass::FloatRoundStyle::round_to_nearest, true>; + ElementAccumulator, BiasScaleMode, cutlass::FloatRoundStyle::round_to_nearest, true>; }; template struct Epilogue { using Op = cutlass::epilogue::thread::LinearCombination; + ElementAccumulator, BiasScaleMode>; }; +constexpr auto DefaultScaleMode = cutlass::epilogue::thread::ScaleType::Default; + template -struct Epilogue { - using Op = - cutlass::epilogue::thread::LinearCombination; +struct Epilogue { + using Op = 
cutlass::epilogue::thread::LinearCombinationSilu; }; -} // namespace ort_fastertransformer +template +struct Epilogue { + using Op = cutlass::epilogue::thread::LinearCombinationRelu; +}; + +template +struct Epilogue { + using Op = cutlass::epilogue::thread::LinearCombinationGeneric< + cutlass::epilogue::thread::GELU_taylor, ElementType, ElementsPerVectorAccess, ElementAccumulator, + ElementAccumulator, DefaultScaleMode, cutlass::FloatRoundStyle::round_to_nearest, true>; +}; + +template +struct Epilogue { + using Op = cutlass::epilogue::thread::LinearCombination; +}; -#endif +} // namespace ort_fastertransformer diff --git a/onnxruntime/contrib_ops/cuda/moe/cutlass_extensions/gemm/device/gemm_universal_base_compat.h b/onnxruntime/contrib_ops/cuda/moe/cutlass_extensions/gemm/device/gemm_universal_base_compat.h new file mode 100644 index 000000000000..f5064afc23ae --- /dev/null +++ b/onnxruntime/contrib_ops/cuda/moe/cutlass_extensions/gemm/device/gemm_universal_base_compat.h @@ -0,0 +1,384 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! + \file + \brief The universal GEMM accommodates serial reductions, parallel reductions, batched strided, and + batched array variants. 
+*/ + +#pragma once + +// #include +#include + +#include "cutlass/arch/arch.h" +#include "cutlass/cutlass.h" +#include "cutlass/device_kernel.h" +#include "cutlass/numeric_types.h" + +#include "cutlass/gemm/gemm.h" +#include "cutlass/gemm/kernel/gemm_universal.h" +#include "cutlass/gemm/threadblock/threadblock_swizzle.h" + +#include "cutlass/gemm/device/default_gemm_configuration.h" +#include "cutlass/gemm/kernel/default_gemm_universal.h" + +#include "cutlass/trace.h" + +//////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace gemm { +namespace device { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/* + This is the device layer from CUTLASS 2.10 (SHA - cc85b64cf676c45f98a17e3a47c0aafcf817f088) + It is replicated here since we needed to duplicate kernel level APIs for mixed dtype GEMMs + and SmoothQuant. The newer device layer is not compatible with these older kernel level APIs. + + Note: While CUTLASS 3.x supports stream-k, none of the kernels in the extensions folder support + that feature at the moment. + */ + +template +class GemmUniversalBaseCompat { + public: + using GemmKernel = GemmKernel_; + using ThreadblockShape = typename GemmKernel::Mma::Shape; + + using ElementA = typename GemmKernel::ElementA; + using LayoutA = typename GemmKernel::LayoutA; + using TensorRefA = TensorRef; + static ComplexTransform const kTransformA = GemmKernel::kTransformA; + + using ElementB = typename GemmKernel::ElementB; + using LayoutB = typename GemmKernel::LayoutB; + using TensorRefB = TensorRef; + static ComplexTransform const kTransformB = GemmKernel::kTransformB; + + using ElementC = typename GemmKernel::ElementC; + using LayoutC = typename GemmKernel::LayoutC; + using TensorRefC = TensorRef; + using TensorRefD = TensorRef; + + using ElementAccumulator = typename GemmKernel::Mma::Policy::Operator::ElementC; + + using EpilogueOutputOp = typename GemmKernel::EpilogueOutputOp; + using ThreadblockSwizzle = typename GemmKernel::ThreadblockSwizzle; + using Operator = typename GemmKernel::Operator; + + /// Argument structure + using Arguments = typename GemmKernel::Arguments; + + protected: + /// Kernel parameters object + typename GemmKernel::Params params_; + + protected: + /// Private helper to obtain the grid dimensions with fix-up for split-K + static void get_grid_shape_(gemm::GemmCoord& grid_tiled_shape, int& gemm_k_size, Arguments const& args) { + // Determine grid shape + ThreadblockSwizzle threadblock_swizzle; + + grid_tiled_shape = threadblock_swizzle.get_tiled_shape( + args.problem_size, {ThreadblockShape::kM, ThreadblockShape::kN, ThreadblockShape::kK}, args.batch_count); + + gemm_k_size = args.problem_size.k(); + + if (args.mode == GemmUniversalMode::kGemm || args.mode == GemmUniversalMode::kGemmSplitKParallel) { + int const kAlignK = + const_max(const_max(128 / sizeof_bits::value, 128 / sizeof_bits::value), 1); + + gemm_k_size = round_up(ceil_div(args.problem_size.k(), args.batch_count), kAlignK); + + if (gemm_k_size) { + grid_tiled_shape.k() = ceil_div(args.problem_size.k(), gemm_k_size); + } + } + } + + public: + /// Constructs the GEMM. + GemmUniversalBaseCompat() {} + + /// Determines whether the GEMM can execute the given problem. 
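+  /// can_implement() first recomputes the tiled shape via get_grid_shape_ above. A hedged
+  /// worked example of that split-K fix-up (numbers illustrative; half-precision A and B
+  /// give kAlignK = 128 / 16 = 8): problem_size.k() = 4096 with batch_count = 3 in kGemm
+  /// mode yields gemm_k_size = round_up(ceil_div(4096, 3), 8) = 1368 and
+  /// grid_tiled_shape.k() = ceil_div(4096, 1368) = 3 serial split-K partitions. The check
+  /// below then rejects any launch whose swizzled grid exceeds the CUDA limit of
+  /// 65535 blocks in the Y or Z dimensions.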
+ static Status can_implement(Arguments const& args) { + // Determine grid shape + cutlass::gemm::GemmCoord grid_tiled_shape; + int gemm_k_size = 0; + + get_grid_shape_(grid_tiled_shape, gemm_k_size, args); + + ThreadblockSwizzle threadblock_swizzle; + dim3 grid = threadblock_swizzle.get_grid_shape(grid_tiled_shape); + + uint32_t const kGridYZMax = ((1 << (sizeof(uint16_t) * 8)) - 1); + + if (!(grid.y <= kGridYZMax && grid.z <= kGridYZMax)) { + return Status::kErrorInvalidProblem; + } + + return GemmKernel::can_implement(args); + } + + /// Gets the workspace size + static size_t get_workspace_size(Arguments const& args) { + CUTLASS_TRACE_HOST("GemmUniversalBaseCompat::get_workspace_size()"); + + size_t workspace_bytes = 0; + + // Determine grid shape + cutlass::gemm::GemmCoord grid_tiled_shape; + int gemm_k_size = 0; + + get_grid_shape_(grid_tiled_shape, gemm_k_size, args); + + if (args.mode == GemmUniversalMode::kGemmSplitKParallel) { + // Split-K parallel always requires a temporary workspace + workspace_bytes = sizeof(ElementC) * size_t(args.batch_stride_D) * size_t(grid_tiled_shape.k()); + } else if (args.mode == GemmUniversalMode::kGemm && grid_tiled_shape.k() > 1) { + // Serial split-K only requires a temporary workspace if the number of partitions along the + // GEMM K dimension is greater than one. + workspace_bytes = sizeof(int) * size_t(grid_tiled_shape.m()) * size_t(grid_tiled_shape.n()); + } + + CUTLASS_TRACE_HOST(" workspace_bytes: " << workspace_bytes); + + workspace_bytes += GemmKernel::get_extra_workspace_size(args, grid_tiled_shape); + + return workspace_bytes; + } + + /// Computes the grid shape + static dim3 get_grid_shape(Arguments const& args) { + CUTLASS_TRACE_HOST("GemmUniversalBaseCompat::get_grid_shape()"); + + ThreadblockSwizzle threadblock_swizzle; + + cutlass::gemm::GemmCoord grid_tiled_shape; + int gemm_k_size = 0; + + get_grid_shape_(grid_tiled_shape, gemm_k_size, args); + dim3 result = threadblock_swizzle.get_grid_shape(grid_tiled_shape); + + CUTLASS_TRACE_HOST(" grid_tiled_shape: " << grid_tiled_shape << "\n" + << " result = {" << result << "}"); + + return result; + } + + /// Computes the maximum number of active blocks per multiprocessor + static int maximum_active_blocks(int smem_capacity = -1) { + CUTLASS_TRACE_HOST("GemmUniversalBaseCompat::maximum_active_blocks()"); + + int max_active_blocks = -1; + int smem_size = static_cast(sizeof(typename GemmKernel::SharedStorage)); + + CUTLASS_TRACE_HOST(" smem_size: " << smem_size << " bytes"); + + if (smem_size <= (48 << 10)) { + cudaError_t result = cudaOccupancyMaxActiveBlocksPerMultiprocessor(&max_active_blocks, Kernel, + GemmKernel::kThreadCount, smem_size); + + if (result == cudaSuccess) { + CUTLASS_TRACE_HOST(" max_active_blocks: " << max_active_blocks); + return max_active_blocks; + } + } else { + // Query assuming zero shared memory then compute occupancy limit based on SMEM + cudaError_t result = cudaOccupancyMaxActiveBlocksPerMultiprocessor(&max_active_blocks, Kernel, + GemmKernel::kThreadCount, 0); + + if (result != cudaSuccess) { + CUTLASS_TRACE_HOST(" cudaOccupancyMaxActiveBlocksPerMultiprocessor() returned error " + << cudaGetErrorString(result)); + + return -1; + } + + if (smem_capacity < 0) { + int device_idx = 0; + result = cudaGetDevice(&device_idx); + + if (result != cudaSuccess) { + return -1; + } + + cudaDeviceProp properties; + result = cudaGetDeviceProperties(&properties, device_idx); + + if (result != cudaSuccess) { + return -1; + } + + smem_capacity = 
static_cast(properties.sharedMemPerMultiprocessor); + } + + int occupancy = std::min(max_active_blocks, smem_capacity / smem_size); + + CUTLASS_TRACE_HOST(" occupancy: " << occupancy); + + return occupancy; + } + + CUTLASS_TRACE_HOST(" returning internal error"); + + return -1; + } + + /// Initializes GEMM state from arguments. + Status initialize(Arguments const& args, void* workspace = nullptr, cudaStream_t stream = nullptr) { + CUTLASS_TRACE_HOST("GemmUniversalBaseCompat::initialize() - workspace " + << workspace << ", stream: " << (stream ? "non-null" : "null")); + + size_t workspace_bytes = get_workspace_size(args); + + CUTLASS_TRACE_HOST(" workspace_bytes: " << workspace_bytes); + + if (workspace_bytes) { + if (!workspace) { + CUTLASS_TRACE_HOST(" error: device workspace must not be null"); + + return Status::kErrorWorkspaceNull; + } + + if (args.mode == GemmUniversalMode::kGemm) { + CUTLASS_TRACE_HOST(" clearing device workspace"); + cudaError_t result = cudaMemsetAsync(workspace, 0, workspace_bytes, stream); + + if (result != cudaSuccess) { + CUTLASS_TRACE_HOST(" cudaMemsetAsync() returned error " << cudaGetErrorString(result)); + + return Status::kErrorInternal; + } + } + } + + // Get CUDA grid shape + cutlass::gemm::GemmCoord grid_tiled_shape; + int gemm_k_size = 0; + + get_grid_shape_(grid_tiled_shape, gemm_k_size, args); + + // Initialize the Params structure + params_ = typename GemmKernel::Params(args, grid_tiled_shape, gemm_k_size, static_cast(workspace)); + + // Specify shared memory capacity for kernel. + int smem_size = static_cast(sizeof(typename GemmKernel::SharedStorage)); + + if (smem_size >= (48 << 10)) { + cudaError_t result = + cudaFuncSetAttribute(Kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size); + + if (result != cudaSuccess) { + return Status::kErrorInternal; + } + } + + return Status::kSuccess; + } + + /// Lightweight update given a subset of arguments + Status update(Arguments const& args, void* workspace = nullptr) { + CUTLASS_TRACE_HOST("GemmUniversalBaseCompat()::update() - workspace: " << workspace); + + size_t workspace_bytes = get_workspace_size(args); + + if (workspace_bytes && !workspace) { + return Status::kErrorWorkspaceNull; + } + + params_.update(args, workspace); + + return Status::kSuccess; + } + + /// Runs the kernel using initialized state. + Status run(cudaStream_t stream = nullptr) { + CUTLASS_TRACE_HOST("GemmUniversalBaseCompat::run()"); + + // + // Configure grid and block dimensions + // + + ThreadblockSwizzle threadblock_swizzle; + + dim3 grid = threadblock_swizzle.get_grid_shape(params_.grid_tiled_shape); + dim3 block(GemmKernel::kThreadCount, 1, 1); + + int smem_size = static_cast(sizeof(typename GemmKernel::SharedStorage)); + + // + // Launch kernel + // + + CUTLASS_TRACE_HOST(" grid: (" << grid << "), block: (" << block << "), SMEM: " << smem_size << " bytes"); + + // Launch + cutlass::Kernel<<>>(params_); + + // + // Query for errors + // + cudaError_t result = cudaGetLastError(); + + if (result != cudaSuccess) { + CUTLASS_TRACE_HOST(" grid launch failed with error " << cudaGetErrorString(result)); + return Status::kErrorInternal; + } + + return Status::kSuccess; + } + + /// Runs the kernel using initialized state. + Status operator()(cudaStream_t stream = nullptr) { return run(stream); } + + /// Runs the kernel using initialized state. 
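+  /// Equivalent to initialize(args, workspace, stream) followed by run(stream).
+  /// A hedged usage sketch (the kernel type and the allocation are illustrative,
+  /// not part of this header):
+  ///
+  ///   GemmUniversalBaseCompat<SomeGemmKernel> gemm;
+  ///   size_t workspace_bytes = gemm.get_workspace_size(args);
+  ///   void* workspace = nullptr;  // e.g. cudaMalloc(&workspace, workspace_bytes)
+  ///   Status status = gemm(args, workspace, stream);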
+ Status operator()(Arguments const& args, void* workspace = nullptr, cudaStream_t stream = nullptr) { + Status status = initialize(args, workspace, stream); + + if (status == Status::kSuccess) { + status = run(stream); + } + + return status; + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace device +} // namespace gemm +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/onnxruntime/contrib_ops/cuda/moe/cutlass_extensions/gemm/device/splitk_gemm_grouped.h b/onnxruntime/contrib_ops/cuda/moe/cutlass_extensions/gemm/device/splitk_gemm_grouped.h new file mode 100644 index 000000000000..b226b73e86fe --- /dev/null +++ b/onnxruntime/contrib_ops/cuda/moe/cutlass_extensions/gemm/device/splitk_gemm_grouped.h @@ -0,0 +1,476 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! 
+ \file + \brief Based on cutlass/include/cutlass/gemm/kernel/gemm_grouped.h +*/ + +#pragma once + +#include +#include +#include +#include + +#include "cutlass/arch/arch.h" +#include "cutlass/cutlass.h" +#include "cutlass/device_kernel.h" +#include "cutlass/numeric_types.h" + +#include "cutlass/gemm/gemm.h" +#include "cutlass/gemm/kernel/gemm_universal.h" +#include "cutlass/gemm/threadblock/threadblock_swizzle.h" + +#include "cutlass/gemm/device/default_gemm_configuration.h" +#include "cutlass/gemm/kernel/default_gemm_universal.h" + +#include "cutlass/trace.h" + +//////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace gemm { +namespace device { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template +__global__ void splitkReduction(T_OUT** out_tensor, const T_IN* in_tensor, GemmCoord const* problem_sizes, int splitk, + int64_t* splitk_buffer_offsets) { + // in_tensor: [problem_idx, k_partition, hidden_size] + // Note that different requests of in_tensor might have different hidden_size (=m*n) + // so, we need to use splitk_buffer_offsets. + // out_tensor: problem_idx * [hidden_size] + + int const problem_idx = blockIdx.y; + GemmCoord problem = problem_sizes[problem_idx]; + int const hidden_size = problem.m() * problem.n(); + const T_IN* in_tensor_ = in_tensor + splitk_buffer_offsets[problem_idx] * splitk; + T_OUT* out_tensor_ = out_tensor[problem_idx]; + + for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < hidden_size; i += blockDim.x * gridDim.x) { + float sum = 0.0f; + for (int k_idx = 0; k_idx < splitk; k_idx++) { + sum += static_cast(in_tensor_[k_idx * hidden_size + i]); + } + out_tensor_[i] = (T_OUT)(sum); + } +} + +/// GEMM Grouped +template +class BaseSplitkGrouped { + public: + using BaseKernel = BaseKernel_; + + using ElementA = typename BaseKernel::ElementA; + using LayoutA = typename BaseKernel::LayoutA; + using TensorRefA = TensorRef; + static ComplexTransform const kTransformA = BaseKernel::kTransformA; + static int const kAlignmentA = BaseKernel::kAlignmentA; + + using ElementB = typename BaseKernel::ElementB; + using LayoutB = typename BaseKernel::LayoutB; + using TensorRefB = TensorRef; + static ComplexTransform const kTransformB = BaseKernel::kTransformB; + static int const kAlignmentB = BaseKernel::kAlignmentB; + + using ElementC = typename BaseKernel::ElementC; + using LayoutC = typename BaseKernel::LayoutC; + using TensorRefC = TensorRef; + using TensorRefD = TensorRef; + static int const kAlignmentC = BaseKernel::kAlignmentC; + + using ElementAccumulator = typename BaseKernel::Mma::Policy::Operator::ElementC; + + using EpilogueOutputOp = typename BaseKernel::EpilogueOutputOp; + using ThreadblockSwizzle = typename threadblock::GemmSplitKHorizontalThreadblockSwizzle; + + using Operator = typename BaseKernel::Operator; + using WarpMmaOperator = typename BaseKernel::Mma::Policy::Operator; + + using ArchMmaOperator = typename WarpMmaOperator::ArchMmaOperator; + using MathOperator = typename WarpMmaOperator::MathOperator; + using OperatorClass = typename WarpMmaOperator::OperatorClass; + using ArchTag = typename WarpMmaOperator::ArchTag; + using ThreadblockShape = typename BaseKernel::Mma::Shape; + using WarpShape = typename BaseKernel::WarpShape; + using InstructionShape = typename BaseKernel::InstructionShape; + static int const kStages = BaseKernel::Mma::kStages; + + /// Argument structure + using Arguments = typename BaseKernel::Arguments; + + using 
ProblemInfo = typename BaseKernel::ProblemVisitor::ProblemInfo; + + protected: + /// Kernel parameters object + typename BaseKernel::Params gemm_params_; + + private: + /// Get the number of tiles across all problems in a group + static int32_t group_tile_count(cutlass::gemm::GemmCoord const* problem_sizes_ptr, int problem_count) { + int32_t tiles = 0; + for (int32_t i = 0; i < problem_count; ++i) { + cutlass::gemm::GemmCoord problem = problem_sizes_ptr[i]; + BaseKernel::ProblemVisitor::possibly_transpose_problem(problem); + tiles += problem_tile_count(problem); + } + return tiles; + } + + /// Copy from `data` to `workspace` + Status copy_to_workspace(void* workspace, void* data, size_t bytes) { + cudaError_t cuda_error = cudaMemcpy(workspace, data, bytes, cudaMemcpyHostToDevice); + if (cuda_error != cudaSuccess) { + // Call cudaGetLastError() to clear the error bit + cuda_error = cudaGetLastError(); + CUTLASS_TRACE_HOST(" cudaMemcpy() returned error " << cudaGetErrorString(cuda_error)); + return Status::kErrorInternal; + } + + return Status::kSuccess; + } + + /// Precomputes scheduling information for the grouped GEMM + Status precompute(Arguments const& args, int32_t tile_count, void* workspace) { + size_t workspace_bytes = get_workspace_size(args); + std::vector host_workspace(workspace_bytes); + BaseKernel::ProblemVisitor::host_precompute(args.host_problem_sizes, args.problem_count, args.threadblock_count, + reinterpret_cast(host_workspace.data())); + return copy_to_workspace(workspace, host_workspace.data(), workspace_bytes); + } + + /// Reorder `data` according to `indices` + template + static void reorder_array(T* data, std::vector const& indices) { + // For now, simply create a copy of the data and then copy over to the original. + std::vector copy(indices.size()); + for (size_t i = 0; i < indices.size(); ++i) { + copy.at(i) = data[indices[i]]; + } + + memcpy(data, copy.data(), indices.size() * sizeof(T)); + } + + public: + /// Constructs the GEMM. + BaseSplitkGrouped() {} + + /// Determines whether the GEMM can execute the given problem. 
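+  /// Delegates to BaseKernel::can_implement(); the alignment and shape checks live in
+  /// the kernel. Callers typically pair this with get_workspace_size() below, which
+  /// reserves the split-K partial sums: sum_i(m_i * n_i) * sizeof(ElementAccumulator)
+  /// * split_k_slices bytes (illustrative: 8 problems of 128x128 with fp32 accumulators
+  /// and split_k_slices = 4 need 2 MiB).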
+ static Status can_implement(Arguments const& args) { return BaseKernel::can_implement(args); } + + /// Get the number of tiles in a problem + static int32_t problem_tile_count(cutlass::gemm::GemmCoord const& problem) { + auto grid = BaseKernel::ProblemVisitor::grid_shape(problem); + return BaseKernel::ProblemVisitor::tile_count(grid); + } + + /// Get the number of tiles across all problems in a group + static int32_t group_tile_count(Arguments const& args) { + if (args.host_problem_sizes == nullptr) { + CUTLASS_TRACE_HOST("Received nullptr for `args.host_problem_sizes"); + return -1; + } + + return group_tile_count(args.host_problem_sizes, args.problem_count); + } + + /// Gets the workspace size + static size_t get_workspace_size(Arguments const& args) { + size_t total_mn = 0; + for (int i = 0; i < args.problem_count; i++) { + total_mn += args.host_problem_sizes[i].m() * args.host_problem_sizes[i].n(); + } + size_t workSpaceSize = total_mn * sizeof(ElementAccumulator) * args.split_k_slices; + + if (BaseKernel::ProblemVisitor::kRequiresPrecomputation) { + workSpaceSize += BaseKernel::ProblemVisitor::get_workspace_size(args.host_problem_sizes, args.problem_count, + args.threadblock_count); + } + return workSpaceSize; + } + + /// Computes the grid shape + static dim3 get_grid_shape(Arguments const& args) { return dim3(args.threadblock_count, 1, 1); } + + /// Computes the maximum number of active blocks per multiprocessor + static int maximum_active_blocks(int smem_capacity = -1) { + CUTLASS_TRACE_HOST("BaseSplitkGrouped::maximum_active_blocks()"); + + int smem_size = static_cast(sizeof(typename BaseKernel::SharedStorage)); + + CUTLASS_TRACE_HOST(" smem_size: " << smem_size << " bytes"); + + cudaError_t result; + if (smem_size > (48 << 10)) { + result = cudaFuncSetAttribute(Kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size); + + if (result != cudaSuccess) { + // Call cudaGetLastError() to clear the error bit + result = cudaGetLastError(); + CUTLASS_TRACE_HOST(" cudaFuncSetAttribute() returned error " << cudaGetErrorString(result)); + return -1; + } + } + + int max_active_blocks = -1; + result = cudaOccupancyMaxActiveBlocksPerMultiprocessor(&max_active_blocks, Kernel, + BaseKernel::kThreadCount, smem_size); + + if (result != cudaSuccess) { + // Call cudaGetLastError() to clear the error bit + result = cudaGetLastError(); + CUTLASS_TRACE_HOST(" cudaOccupancyMaxActiveBlocksPerMultiprocessor() returned error " + << cudaGetErrorString(result)); + return -1; + } + + CUTLASS_TRACE_HOST(" max_active_blocks: " << max_active_blocks); + return max_active_blocks; + } + + /// Sorts each pointer passed in according to the indices that sort + /// `problem_sizes_ptr` in descending order of problem-K dimension. 
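+  /// For example (illustrative): problems with K extents {128, 512, 256} are visited
+  /// as {512, 256, 128}, and the lda/ldb/ldc/ldd arrays and the A/B/C/D offsets are
+  /// permuted with the same indices, so entry i still describes problem i after the sort.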
+ static void sort_problems(int problem_count, cutlass::gemm::GemmCoord* problem_sizes_ptr, int64_t* lda_host_ptr, + int64_t* ldb_host_ptr, int64_t* ldc_host_ptr, int64_t* ldd_host_ptr, int64_t* offset_A_ptr, + int64_t* offset_B_ptr, int64_t* offset_C_ptr, int64_t* offset_D_ptr) { + std::vector indices(problem_count); + std::iota(indices.begin(), indices.end(), 0); + std::stable_sort(indices.begin(), indices.end(), [&problem_sizes_ptr](size_t i, size_t j) { + return problem_sizes_ptr[i].k() > problem_sizes_ptr[j].k(); + }); + + reorder_array(problem_sizes_ptr, indices); + reorder_array(lda_host_ptr, indices); + reorder_array(ldb_host_ptr, indices); + reorder_array(ldc_host_ptr, indices); + reorder_array(ldd_host_ptr, indices); + reorder_array(offset_A_ptr, indices); + reorder_array(offset_B_ptr, indices); + reorder_array(offset_C_ptr, indices); + reorder_array(offset_D_ptr, indices); + } + + /// Computes the number of threadblocks to launch for the grouped kernel + static int sufficient(cutlass::gemm::GemmCoord const* problem_sizes_ptr = nullptr, int problem_count = 0, + int available_sm_count = -1) { + // Determine the number of blocks that would be launched to fill up a single + // wave on the GPU with each SM having maximum occupancy. + int device_idx; + cudaError_t result = cudaGetDevice(&device_idx); + if (result != cudaSuccess) { + // Call cudaGetLastError() to clear the error bit + result = cudaGetLastError(); + CUTLASS_TRACE_HOST(" cudaGetDevice() returned error " << cudaGetErrorString(result)); + return 0; + } + + int multiprocessor_count; + result = cudaDeviceGetAttribute(&multiprocessor_count, cudaDevAttrMultiProcessorCount, device_idx); + if (result != cudaSuccess) { + CUTLASS_TRACE_HOST(" cudaDeviceGetAttribute() returned error " << cudaGetErrorString(result)); + return 0; + } + + bool override_sm_count = (available_sm_count < 0 || available_sm_count > multiprocessor_count); + if (override_sm_count) { + available_sm_count = multiprocessor_count; + } + + int max_active_blocks = maximum_active_blocks(); + if (max_active_blocks <= 0) { + return 0; + } + + int occupancy_based_block_count = available_sm_count * max_active_blocks; + + if (problem_sizes_ptr == nullptr || problem_count == 0) { + return occupancy_based_block_count; + } + + int total_tiles = group_tile_count(problem_sizes_ptr, problem_count); + + // If the group contains a single problem, launching the exact number of + // threadblocks needed to cover the problem minimizes the work performed + // per threadblock in finding the next tile to compute. We return total_tiles + // unless the user has provided the SM count. + if (problem_count == 1 && override_sm_count) { + return total_tiles; + } + + // Choose between the full wave of threadblocks and the tile count. If there + // are fewer tiles in the group than threadblocks in the full wave, only + // some threadblocks will be assigned tiles. Those threadblocks + // which are not assigned tiles still need to perform the work of iterating through + // problem sizes to determine that they have no work to do. This competes for cycles + // with those threadblocks that are assigned tiles to compute. + return std::min(total_tiles, occupancy_based_block_count); + } + + /// Initializes GEMM state from arguments. + Status initialize(Arguments const& args, void* workspace = nullptr, cudaStream_t stream = nullptr) { + CUTLASS_TRACE_HOST("BaseSplitkGrouped::initialize() - workspace " + << workspace << ", stream: " << (stream ? 
"non-null" : "null")); + + // Workspace + size_t workspace_bytes = get_workspace_size(args); + + if (workspace_bytes && !workspace) { + return Status::kErrorWorkspaceNull; + } + + if (BaseKernel::ProblemVisitor::kRequiresPrecomputation) { + int32_t tile_count = group_tile_count(args); + Status status = precompute(args, tile_count, workspace); + if (status != Status::kSuccess) { + return status; + } + + gemm_params_ = typename BaseKernel::Params(args, workspace, tile_count); + } else { + gemm_params_ = typename BaseKernel::Params(args, workspace); + } + + // Specify shared memory capacity for kernel. + int smem_size = static_cast(sizeof(typename BaseKernel::SharedStorage)); + + if (smem_size >= (48 << 10)) { + cudaError_t result = + cudaFuncSetAttribute(Kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size); + + if (result != cudaSuccess) { + return Status::kErrorInternal; + } + } + + return Status::kSuccess; + } + + /// Lightweight update given a subset of arguments + Status update(Arguments const& args, void* workspace = nullptr) { + size_t workspace_bytes = get_workspace_size(args); + + if (workspace_bytes && !workspace) { + return Status::kErrorWorkspaceNull; + } + + if (BaseKernel::ProblemVisitor::kRequiresPrecomputation) { + int32_t tile_count = group_tile_count(args); + Status status = precompute(args, tile_count, workspace); + if (status != Status::kSuccess) { + return status; + } + + gemm_params_.update(args, workspace, tile_count); + } else { + gemm_params_.update(args, workspace); + } + + return Status::kSuccess; + } + + /// Runs the kernel using initialized state. + Status run(cudaStream_t stream = nullptr) { + if (!gemm_params_.problem_visitor.problem_count) { + return Status::kSuccess; + } + + // + // Launch kernel + // + + // Launch splitk grouped gemm + { + dim3 grid(gemm_params_.threadblock_count, 1, gemm_params_.split_k_slices); + dim3 block(BaseKernel::kThreadCount, 1, 1); + + int smem_size = static_cast(sizeof(typename BaseKernel::SharedStorage)); + cutlass::Kernel<<>>(gemm_params_); + + cudaError_t result = cudaGetLastError(); + + if (result != cudaSuccess) { + CUTLASS_TRACE_HOST(" grid launch failed with error " << cudaGetErrorString(result)); + return Status::kErrorInternal; + } + } + + // Launch splitkReduction + { + dim3 grid(32, gemm_params_.problem_visitor.problem_count); + dim3 block(256); + splitkReduction<<>>(gemm_params_.ptr_D, gemm_params_.ptr_D_split, + gemm_params_.problem_visitor.problem_sizes, + gemm_params_.split_k_slices, gemm_params_.splitk_buffer_offsets); + + cudaError_t result = cudaGetLastError(); + + if (result != cudaSuccess) { + CUTLASS_TRACE_HOST(" grid launch failed with error " << cudaGetErrorString(result)); + return Status::kErrorInternal; + } + } + + return Status::kSuccess; + } + + /// Runs the kernel using initialized state. + Status operator()(cudaStream_t stream = nullptr) { return run(stream); } + + /// Initializes and runs the kernel. 
+ Status operator()(Arguments const& args, void* workspace, cudaStream_t stream = nullptr) { + Status status = initialize(args, workspace, stream); + + if (status == Status::kSuccess) { + status = run(stream); + } + + return status; + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// GEMM Grouped +template +class SplitkGemmGrouped : public BaseSplitkGrouped { + public: + using GemmKernel = GemmKernel_; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace device +} // namespace gemm +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/onnxruntime/contrib_ops/cuda/moe/ft_moe/layout_traits_helper.h b/onnxruntime/contrib_ops/cuda/moe/cutlass_extensions/gemm/kernel/default_fpA_intB_traits.h similarity index 71% rename from onnxruntime/contrib_ops/cuda/moe/ft_moe/layout_traits_helper.h rename to onnxruntime/contrib_ops/cuda/moe/cutlass_extensions/gemm/kernel/default_fpA_intB_traits.h index efb30d07507b..2b3478a38fc2 100644 --- a/onnxruntime/contrib_ops/cuda/moe/ft_moe/layout_traits_helper.h +++ b/onnxruntime/contrib_ops/cuda/moe/cutlass_extensions/gemm/kernel/default_fpA_intB_traits.h @@ -1,11 +1,12 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, @@ -13,53 +14,22 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -/* - This file exists so that we use the same weight layout for MoE grouped gemm and regular gemm when the weight is - quantized. The preprocessing code reads this template to know how to organize the quantized weight matrices - to be consumed by CUTLASS. - - Note that for int4, ThreadBlockK MUST be 64. - - */ - -#ifdef USE_CUTLASS - #pragma once -#include "cutlass/layout/matrix.h" -#include "cutlass/numeric_types.h" #include "cutlass/arch/arch.h" #include "cutlass/arch/mma.h" -#include "cutlass/platform/platform.h" +#include "cutlass/bfloat16.h" #include "cutlass/cutlass.h" #include "cutlass/gemm/gemm.h" +#include "cutlass/layout/matrix.h" + +#include "contrib_ops/cuda/moe/cutlass_extensions/arch/mma.h" +#include "contrib_ops/cuda/moe/cutlass_extensions/gemm/kernel/mixed_gemm_B_layout.h" namespace cutlass { namespace gemm { namespace kernel { -template -struct LayoutDetailsB {}; - -// Volta specialiations. Volta will dequantize before STS, so we need a different operator -template -struct LayoutDetailsB { - static constexpr int ThreadblockK = 64; - using Layout = layout::RowMajor; - static constexpr int ElementsPerAccess = 8; - using Operator = cutlass::arch::OpMultiplyAdd; -}; - -// Specializations for Turing+ when B is FP16. These are currently only used for MoE networks. -// TODO - Switch this to column major for weights since gemms should be more performant. 
-template -struct LayoutDetailsB= 75>::type> { - static constexpr int ThreadblockK = 64; - using Layout = layout::RowMajor; - static constexpr int ElementsPerAccess = 128 / cutlass::sizeof_bits::value; - using Operator = cutlass::arch::OpMultiplyAdd; -}; - template struct MixedGemmArchTraits {}; @@ -68,7 +38,7 @@ struct MixedGemmArchTraits { static constexpr int Stages = 2; using OperatorClass = cutlass::arch::OpClassSimt; using AccType = float; - using LayoutB = cutlass::layout::RowMajor; + using LayoutB = cutlass::layout::ColumnMajor; static constexpr int ElementsPerAccessA = 1; static constexpr int ElementsPerAccessB = 1; @@ -82,10 +52,13 @@ struct MixedGemmArchTraits { // ========================= Volta Traits =========================== // Volta will always dequantize after the global memory load. // This will instantiate any HMMA tensorcore kernels for Volta. +// Note that volta does not have native bfloat support so weights and activations will be casted to fp16 +// and compute will happen in fp16 then will be converted for bf16 output. template struct MixedGemmArchTraits< TypeA, TypeB, cutlass::arch::Sm70, - typename cutlass::platform::enable_if::value>::type> { + typename cutlass::platform::enable_if::value || + cutlass::platform::is_same::value>::type> { private: using LayoutDetails = LayoutDetailsB; @@ -105,10 +78,13 @@ struct MixedGemmArchTraits< }; // ======================= Turing Traits ============================== +// Note that turing does not have native bfloat support so weights and activations will be casted to fp16 +// and compute will happen in fp16 then will be converted for bf16 output. template struct MixedGemmArchTraits< TypeA, TypeB, cutlass::arch::Sm75, - typename cutlass::platform::enable_if::value>::type> { + typename cutlass::platform::enable_if::value || + cutlass::platform::is_same::value>::type> { private: using LayoutDetails = LayoutDetailsB; @@ -131,7 +107,8 @@ struct MixedGemmArchTraits< template struct MixedGemmArchTraits< TypeA, TypeB, cutlass::arch::Sm80, - typename cutlass::platform::enable_if::value>::type> { + typename cutlass::platform::enable_if::value || + cutlass::platform::is_same::value>::type> { private: using LayoutDetails = LayoutDetailsB; @@ -153,5 +130,3 @@ struct MixedGemmArchTraits< } // namespace kernel } // namespace gemm } // namespace cutlass - -#endif diff --git a/onnxruntime/contrib_ops/cuda/moe/cutlass_extensions/gemm/kernel/default_int8_traits.h b/onnxruntime/contrib_ops/cuda/moe/cutlass_extensions/gemm/kernel/default_int8_traits.h new file mode 100644 index 000000000000..fe4bc0940d9e --- /dev/null +++ b/onnxruntime/contrib_ops/cuda/moe/cutlass_extensions/gemm/kernel/default_int8_traits.h @@ -0,0 +1,51 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once + +#include "cutlass/arch/arch.h" +#include "cutlass/arch/mma.h" +#include "cutlass/cutlass.h" +#include "cutlass/gemm/gemm.h" +#include "cutlass/layout/matrix.h" + +namespace cutlass { +namespace gemm { +namespace kernel { + +template +struct Int8GemmArchTraits { + using OperatorClass = cutlass::arch::OpClassSimt; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; +}; + +// ======================= Turing Traits ============================== +template <> +struct Int8GemmArchTraits { + using OperatorClass = cutlass::arch::OpClassTensorOp; + using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; +}; + +// ======================= Ampere Traits ============================== +template <> +struct Int8GemmArchTraits { + using OperatorClass = cutlass::arch::OpClassTensorOp; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 32>; +}; + +} // namespace kernel +} // namespace gemm +} // namespace cutlass diff --git a/onnxruntime/contrib_ops/cuda/moe/cutlass_extensions/gemm/kernel/default_splitk_gemm_grouped.h b/onnxruntime/contrib_ops/cuda/moe/cutlass_extensions/gemm/kernel/default_splitk_gemm_grouped.h new file mode 100644 index 000000000000..9339be92dfb2 --- /dev/null +++ b/onnxruntime/contrib_ops/cuda/moe/cutlass_extensions/gemm/kernel/default_splitk_gemm_grouped.h @@ -0,0 +1,206 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*! \file + \brief + Default kernel-level GEMM definitions combine threadblock-scoped matrix multiply-add with + the appropriate threadblock-scoped epilogue. + + Note, CUTLASS epilogues universally target row-major outputs. Column-major outputs are + accommodated by exchanging A and B operands and assuming transposed layouts. 
Partial + specializations here choose 'device::GemmTransposed' to implement this functionality. + +*/ + +#pragma once + +#include "cutlass/cutlass.h" + +#include "cutlass/complex.h" +#include "cutlass/layout/matrix.h" +#include "cutlass/numeric_types.h" + +#include "cutlass/gemm/device/default_gemm_configuration.h" +#include "cutlass/gemm/kernel/default_gemm.h" +#include "cutlass/gemm/kernel/default_gemm_complex.h" +#include "cutlass/gemm/kernel/gemm_transpose_operands.h" + +#include "cutlass/layout/permute.h" + +#include "contrib_ops/cuda/moe/cutlass_extensions/gemm/kernel/splitk_gemm_grouped.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace gemm { +namespace kernel { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + /// Element type for A matrix operand + typename ElementA_, + /// Layout type for A matrix operand + typename LayoutA_, + /// Complex elementwise transformation on A operand + ComplexTransform TransformA, + /// Access granularity of A matrix in units of elements + int kAlignmentA, + /// Element type for B matrix operand + typename ElementB_, + /// Layout type for B matrix operand + typename LayoutB_, + /// Complex elementwise transformation on B operand + ComplexTransform TransformB, + /// Access granularity of B matrix in units of elements + int kAlignmentB, + /// Element type for C and D matrix operands + typename ElementC_, + /// Layout type for C and D matrix operands + typename LayoutC_, + /// Element type for internal accumulation + typename ElementAccumulator, + /// Operator class tag + typename OperatorClass, + /// Tag indicating architecture to tune for + typename ArchTag, + /// Threadblock-level tile size (concept: GemmShape) + typename ThreadblockShape, + /// Warp-level tile size (concept: GemmShape) + typename WarpShape, + /// Warp-level tile size (concept: GemmShape) + typename InstructionShape, + /// Epilogue output operator + typename EpilogueOutputOp, + /// Threadblock-level swizzling operator + typename ThreadblockSwizzle, + /// Number of stages used in the pipelined mainloop + int Stages, + /// Whether the schedule of problems to visit has been precomputed + GroupScheduleMode GroupScheduleMode_ = GroupScheduleMode::kDeviceOnly, + /// Operation performed by GEMM + typename Operator = typename device::DefaultGemmConfiguration::Operator, + /// Use zfill or predicate for out-of-bound cp.async + SharedMemoryClearOption SharedMemoryClear = SharedMemoryClearOption::kNone, + /// Permute result D + typename PermuteDLayout = layout::NoPermute, + /// + typename Enable = void> +struct DefaultSplitkGemmGrouped; + +///////////////////////////////////////////////////////////////////////////////////////////////// +// +// Real-valued GEMM kernels +// + +template < + /// Element type for A matrix operand + typename ElementA, + /// Layout type for A matrix operand + typename LayoutA, + /// Access granularity of A matrix in units of elements + int kAlignmentA, + /// Element type for B matrix operand + typename ElementB, + /// Layout type for B matrix operand + typename LayoutB, + /// Access granularity of B matrix in units of elements + int kAlignmentB, + /// Element type for C and D matrix operands + typename ElementC, + /// Layout type for C and D matrix operands + typename LayoutC, + /// Element type for internal accumulation + typename ElementAccumulator, + /// Operator class tag + typename OperatorClass, + /// Tag 
indicating architecture to tune for + typename ArchTag, + /// Threadblock-level tile size (concept: GemmShape) + typename ThreadblockShape, + /// Warp-level tile size (concept: GemmShape) + typename WarpShape, + /// Warp-level tile size (concept: GemmShape) + typename InstructionShape, + /// Epilogue output operator + typename EpilogueOutputOp, + /// Threadblock-level swizzling operator + typename ThreadblockSwizzle, + /// Number of stages used in the pipelined mainloop + int Stages, + /// Whether the schedule of problems to visit has been precomputed + GroupScheduleMode GroupScheduleMode_, + /// Operation performed by GEMM + typename Operator, + /// Use zfill or predicate for out-of-bound cp.async + SharedMemoryClearOption SharedMemoryClear, + /// Permute result D + typename PermuteDLayout> +struct DefaultSplitkGemmGrouped::value>::type> { + // If true, we must construct a 'transposed-and-exchanged' Mma operator. + static bool const kInternalTranspose = platform::is_same::value; + + using MapArguments = + kernel::detail::MapArguments; + + // Define the default GEMM kernel + using DefaultGemmKernel = + typename kernel::DefaultGemm::GemmKernel; + + /// Define the kernel in terms of the default kernel + using GemmKernel = kernel::SplitkGemmGrouped; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace kernel +} // namespace gemm +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/onnxruntime/contrib_ops/cuda/moe/cutlass_extensions/gemm/kernel/fpA_intB_gemm.h b/onnxruntime/contrib_ops/cuda/moe/cutlass_extensions/gemm/kernel/fpA_intB_gemm.h new file mode 100644 index 000000000000..778d45f39eab --- /dev/null +++ b/onnxruntime/contrib_ops/cuda/moe/cutlass_extensions/gemm/kernel/fpA_intB_gemm.h @@ -0,0 +1,513 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*! \file + \brief Template for a pipelined GEMM kernel. Does not compute batching or support split-K. +*/ + +#pragma once + +#include +#include + +#include "cutlass/cutlass.h" + +#include "cutlass/arch/arch.h" +#include "cutlass/gemm/gemm.h" +#include "cutlass/matrix_coord.h" +#include "cutlass/semaphore.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace gemm { +namespace kernel { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace detail { +template +inline constexpr bool dependent_false_v = false; +} + +template +struct GemmFpAIntB { + using Mma = Mma_; + using Epilogue = Epilogue_; + using EpilogueOutputOp = typename Epilogue::OutputOp; + using ThreadblockSwizzle = ThreadblockSwizzle_; + static bool const kSplitKSerial = SplitKSerial; + + using ElementA = typename Mma::IteratorA::Element; + using LayoutA = typename Mma::IteratorA::Layout; + using ElementB = typename Mma::IteratorB::Element; + using LayoutB = typename Mma::IteratorB::Element; + using ElementC = typename Epilogue::OutputTileIterator::Element; + using LayoutC = typename Mma::LayoutC; + using ElementScale = ElementC; + + static ComplexTransform const kTransformA = Mma::kTransformA; + static ComplexTransform const kTransformB = Mma::kTransformA; + + // Type definitions about the mainloop. 
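+  // These aliases re-export the mainloop's shapes and operator tags so the device
+  // layer and dispatch heuristics can introspect the kernel without knowing Mma internals.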
+ using Operator = typename Mma::Operator; + using OperatorClass = typename Mma::Operator::OperatorClass; + using ThreadblockShape = typename Mma::Shape; + using WarpShape = typename Mma::Operator::Shape; + using InstructionShape = typename Mma::Policy::Operator::InstructionShape; + using ArchTag = typename Mma::ArchTag; + + static int const kStages = Mma::kStages; + static int const kAlignmentA = Mma::IteratorA::AccessType::kElements; + static int const kAlignmentB = Mma::IteratorB::AccessType::kElements; + static int const kAlignmentC = Epilogue::OutputTileIterator::kElementsPerAccess; + + /// Warp count (concept: GemmShape) + using WarpCount = typename Mma::WarpCount; + static int const kThreadCount = 32 * WarpCount::kCount; + + static constexpr int kInterleave = Mma::IteratorB::Shape::kRow / Mma::Shape::kK; + + /// Parameters structure + struct Arguments { + GemmUniversalMode mode = GemmUniversalMode::kGemm; + + cutlass::gemm::GemmCoord problem_size; + int group_size; + typename Mma::IteratorA::TensorRef ref_A; + typename Mma::IteratorB::TensorRef ref_B; + typename Mma::IteratorScale::TensorRef ref_scale; + typename Mma::IteratorScale::TensorRef ref_zero; + typename Epilogue::OutputTileIterator::TensorRef ref_C; + typename Epilogue::OutputTileIterator::TensorRef ref_D; + + // Control serial split-k + int batch_count; + + typename EpilogueOutputOp::Params output_op; + + // For gather+scatter operations + int const* gather_A_indices; + int const* gather_B_indices; + int const* scatter_D_indices; + + // Included so we can use Gemm Universal + int batch_stride_D = 0; + + // + // Methods + // + + CUTLASS_HOST_DEVICE + Arguments() {} + + CUTLASS_HOST_DEVICE + Arguments(cutlass::gemm::GemmCoord const& problem_size, int const group_size, + typename Mma::IteratorA::TensorRef ref_A, typename Mma::IteratorB::TensorRef ref_B, + typename Mma::IteratorScale::TensorRef ref_scale, typename Mma::IteratorScale::TensorRef ref_zero, + typename Epilogue::OutputTileIterator::TensorRef ref_C, + typename Epilogue::OutputTileIterator::TensorRef ref_D, int serial_split_k_factor, + typename EpilogueOutputOp::Params output_op = typename EpilogueOutputOp::Params(), + int const* gather_A_indices = nullptr, int const* gather_B_indices = nullptr, + int const* scatter_D_indices = nullptr) + : problem_size(problem_size), + group_size(group_size), + ref_A(ref_A), + ref_B(ref_B), + ref_scale(ref_scale), + ref_zero(ref_zero), + ref_C(ref_C), + ref_D(ref_D), + batch_count(serial_split_k_factor), + output_op(output_op), + gather_A_indices(gather_A_indices), + gather_B_indices(gather_B_indices), + scatter_D_indices(scatter_D_indices) {} + }; + + /// Parameters structure + struct Params { + cutlass::gemm::GemmCoord problem_size; + int group_size; + cutlass::gemm::GemmCoord grid_tiled_shape; + int swizzle_log_tile; + typename Mma::IteratorA::Params params_A; + typename Mma::IteratorA::TensorRef ref_A; + typename Mma::IteratorB::Params params_B; + typename Mma::IteratorB::TensorRef ref_B; + typename Mma::IteratorScale::Params params_scale; + typename Mma::IteratorScale::TensorRef ref_scale; + typename Mma::IteratorScale::TensorRef ref_zero; + typename Epilogue::OutputTileIterator::Params params_C; + typename Epilogue::OutputTileIterator::TensorRef ref_C; + typename Epilogue::OutputTileIterator::Params params_D; + typename Epilogue::OutputTileIterator::TensorRef ref_D; + typename EpilogueOutputOp::Params output_op; + int* semaphore; + int gemm_k_size; + // For gather+scatter operations + int const* gather_A_indices; + int const* 
gather_B_indices; + int const* scatter_D_indices; + + // + // Methods + // + + CUTLASS_HOST_DEVICE + Params() : swizzle_log_tile(0), semaphore(0), gemm_k_size(0) {} + + CUTLASS_HOST_DEVICE + Params(Arguments const& args, cutlass::gemm::GemmCoord const& grid_tiled_shape, int const gemm_k_size, + void* workspace = nullptr) + : problem_size(args.problem_size), + group_size(args.group_size), + grid_tiled_shape(grid_tiled_shape), + swizzle_log_tile(ThreadblockSwizzle().get_log_tile(grid_tiled_shape)), + params_A(args.ref_A.layout()), + ref_A(args.ref_A), + params_B(args.ref_B.layout()), + ref_B(args.ref_B), + params_scale(args.ref_scale.layout()), + ref_scale(args.ref_scale), + ref_zero(args.ref_zero), + params_C(args.ref_C.layout()), + ref_C(args.ref_C), + params_D(args.ref_D.layout()), + ref_D(args.ref_D), + output_op(args.output_op), + semaphore(static_cast(workspace)), + gemm_k_size(gemm_k_size), + gather_A_indices(args.gather_A_indices), + gather_B_indices(args.gather_B_indices), + scatter_D_indices(args.scatter_D_indices) {} + }; + + /// Shared memory storage structure + union SharedStorage { + typename Mma::SharedStorage main_loop; + typename Epilogue::SharedStorage epilogue; + }; + + // + // Methods + // + + CUTLASS_HOST_DEVICE + GemmFpAIntB() {} + + /// Determines whether kernel satisfies alignment + CUTLASS_HOST_DEVICE + static Status can_implement(Arguments const& args) { + static int const kAlignmentA = + (platform::is_same>::value) ? 32 + : (platform::is_same>::value) + ? 64 + : Mma::IteratorA::AccessType::kElements; + static int const kAlignmentB = + (platform::is_same>::value) ? 32 + : (platform::is_same>::value) + ? 64 + : Mma::IteratorB::AccessType::kElements; + + static int const kAlignmentScale = Mma::IteratorScale::AccessType::kElements; + + static int const kAlignmentC = + (platform::is_same>::value) + ? 32 + : (platform::is_same>::value) + ? 64 + : Epilogue::OutputTileIterator::kElementsPerAccess; + + if (!TensorRef_aligned(args.ref_A, kAlignmentA)) { + return Status::kErrorMisalignedOperand; + } + + if (!TensorRef_aligned(args.ref_B, kAlignmentB)) { + return Status::kErrorMisalignedOperand; + } + + if (!TensorRef_aligned(args.ref_scale, kAlignmentScale)) { + return Status::kErrorMisalignedOperand; + } + + if (!TensorRef_aligned(args.ref_zero, kAlignmentScale)) { + return Status::kErrorMisalignedOperand; + } + + if (!TensorRef_aligned(args.ref_C, kAlignmentC)) { + return Status::kErrorMisalignedOperand; + } + + if (!TensorRef_aligned(args.ref_D, kAlignmentC)) { + return Status::kErrorMisalignedOperand; + } + + if (!args.ref_scale.good()) { + return Status::kErrorNotSupported; + } + + if constexpr (hasZero(Mma::QuantOp)) { + if (!args.ref_zero.good()) { + return Status::kErrorNotSupported; + } + } else { + if (args.ref_zero.good()) { + return Status::kErrorNotSupported; + } + } + + if constexpr (isFinegrained(Mma::QuantOp)) { + if (args.group_size != 64 && args.group_size != 128) { + return Status::kErrorNotSupported; + } + } + + return Status::kSuccess; + } + + static size_t get_extra_workspace_size(Arguments const& args, cutlass::gemm::GemmCoord const& grid_tiled_shape) { + return 0; + } + + // Initializes the fine grained scale+bias iterator. 
Needed since the fine grained iterator + // has a different constructor signature than a regular cutlass iterator + template = true> + CUTLASS_DEVICE static IteratorScale initialize_scale(typename IteratorScale::Params const& params, + typename IteratorScale::Pointer pointer_scale, + typename IteratorScale::Pointer pointer_zero, + typename IteratorScale::TensorCoord extent, int thread_id, + typename IteratorScale::TensorCoord const& threadblock_offset, + int group_size) { + return IteratorScale(params, pointer_scale, pointer_zero, extent, thread_id, threadblock_offset, group_size); + } + + template = true> + CUTLASS_DEVICE static IteratorScale initialize_scale(typename IteratorScale::Params const& params, + typename IteratorScale::Pointer pointer_scale, + typename IteratorScale::Pointer pointer_zero, + typename IteratorScale::TensorCoord extent, int thread_id, + typename IteratorScale::TensorCoord const& threadblock_offset, + int group_size) { + return IteratorScale(params, pointer_scale, extent, thread_id, threadblock_offset); + } + + CUTLASS_DEVICE + void run_kernel_(Params const& params, SharedStorage& shared_storage) { + using LayoutB = typename Mma::IteratorB::Layout; + static_assert(platform::is_same::value && kInterleave == 1 || + platform::is_same::value && kInterleave >= 1, + "B must be row major/col major OR col major interleaved."); + + // Compute threadblock location + ThreadblockSwizzle threadblock_swizzle; + + cutlass::gemm::GemmCoord threadblock_tile_offset = threadblock_swizzle.get_tile_offset(params.swizzle_log_tile); + + // Early exit if CTA is out of range + if (params.grid_tiled_shape.m() <= threadblock_tile_offset.m() || + params.grid_tiled_shape.n() <= threadblock_tile_offset.n()) { + return; + } + + // Compute initial location in logical coordinates + cutlass::MatrixCoord tb_offset_A{ + threadblock_tile_offset.m() * Mma::Shape::kM, + threadblock_tile_offset.k() * params.gemm_k_size, + }; + + cutlass::MatrixCoord tb_offset_B{threadblock_tile_offset.k() * params.gemm_k_size * kInterleave, + threadblock_tile_offset.n() * Mma::Shape::kN / kInterleave}; + + typename MatrixCoord::Index fg_row_offset = threadblock_tile_offset.k() * params.gemm_k_size / 64; + typename MatrixCoord::Index scale_row_offset = isFinegrained(Mma::QuantOp) ? fg_row_offset : 0; + cutlass::MatrixCoord tb_offset_scale{scale_row_offset, threadblock_tile_offset.n() * Mma::Shape::kN}; + + // Problem size is a function of threadblock index in the K dimension + int problem_size_k = min(params.problem_size.k(), (threadblock_tile_offset.k() + 1) * params.gemm_k_size); + + // Compute threadblock-scoped matrix multiply-add + int gemm_k_iterations = (problem_size_k - tb_offset_A.column() + Mma::Shape::kK - 1) / Mma::Shape::kK; + + // Compute position within threadblock + int thread_idx = threadIdx.x; + + // Construct iterators to A and B operands + typename Mma::IteratorA iterator_A(params.params_A, params.ref_A.data(), {params.problem_size.m(), problem_size_k}, + thread_idx, tb_offset_A, params.gather_A_indices); + + typename Mma::IteratorB iterator_B(params.params_B, params.ref_B.data(), + {problem_size_k * kInterleave, params.problem_size.n() / kInterleave}, + thread_idx, tb_offset_B, params.gather_B_indices); + + typename MatrixCoord::Index scale_row_extent = isFinegrained(Mma::QuantOp) ? 
problem_size_k / 64 : 1; + typename Mma::IteratorScale iterator_scale = initialize_scale( + params.params_scale, params.ref_scale.data(), params.ref_zero.data(), + {scale_row_extent, params.problem_size.n()}, thread_idx, tb_offset_scale, params.group_size); + + // Broadcast the warp_id computed by lane 0 to ensure dependent code + // is compiled as warp-uniform. + int warp_idx = __shfl_sync(0xffffffff, threadIdx.x / 32, 0); + int lane_idx = threadIdx.x % 32; + + // + // Main loop + // + // Construct thread-scoped matrix multiply + Mma mma(shared_storage.main_loop, params.group_size, thread_idx, warp_idx, lane_idx); + + typename Mma::FragmentC accumulators; + + accumulators.clear(); + + if (!kSplitKSerial || gemm_k_iterations > 0) { + // Compute threadblock-scoped matrix multiply-add + mma(gemm_k_iterations, accumulators, iterator_A, iterator_B, iterator_scale, accumulators); + } + + // + // Epilogue + // + + EpilogueOutputOp output_op(params.output_op); + + // + // Masked tile iterators constructed from members + // + + threadblock_tile_offset = threadblock_swizzle.get_tile_offset(params.swizzle_log_tile); + + // assume identity swizzle + MatrixCoord threadblock_offset(threadblock_tile_offset.m() * Mma::Shape::kM, + threadblock_tile_offset.n() * Mma::Shape::kN); + + int block_idx = threadblock_tile_offset.m() + threadblock_tile_offset.n() * params.grid_tiled_shape.m(); + + // Construct the semaphore. + Semaphore semaphore(params.semaphore + block_idx, thread_idx); + + // If performing a reduction via split-K, fetch the initial synchronization + if (kSplitKSerial && params.grid_tiled_shape.k() > 1) { + // Fetch the synchronization lock initially but do not block. + semaphore.fetch(); + + // Indicate which position in a serial reduction the output operator is currently updating + output_op.set_k_partition(threadblock_tile_offset.k(), params.grid_tiled_shape.k()); + } + + // Tile iterator loading from source tensor. + typename Epilogue::OutputTileIterator iterator_C(params.params_C, params.ref_C.data(), params.problem_size.mn(), + thread_idx, threadblock_offset, params.scatter_D_indices); + + // Tile iterator writing to destination tensor. + typename Epilogue::OutputTileIterator iterator_D(params.params_D, params.ref_D.data(), params.problem_size.mn(), + thread_idx, threadblock_offset, params.scatter_D_indices); + + Epilogue epilogue(shared_storage.epilogue, thread_idx, warp_idx, lane_idx); + + // Wait on the semaphore - this latency may have been covered by iterator construction + if (kSplitKSerial && params.grid_tiled_shape.k() > 1) { + // For subsequent threadblocks, the source matrix is held in the 'D' tensor. + if (threadblock_tile_offset.k()) { + iterator_C = iterator_D; + } + + semaphore.wait(threadblock_tile_offset.k()); + } + + // Execute the epilogue operator to update the destination tensor. + epilogue(output_op, iterator_D, accumulators, iterator_C); + + // + // Release the semaphore + // + + if (kSplitKSerial && params.grid_tiled_shape.k() > 1) { + int lock = 0; + if (params.grid_tiled_shape.k() == threadblock_tile_offset.k() + 1) { + // The final threadblock resets the semaphore for subsequent grids. 
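+            // (Editorial note: the lock value doubles as the index of the k-partition that is
+            // allowed to proceed next. Partition k releases the semaphore with k + 1; the last
+            // partition releases 0 so the semaphore is back in its initial state for the next
+            // kernel launch over this workspace.)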
+ lock = 0; + } else { + // Otherwise, the semaphore is incremented + lock = threadblock_tile_offset.k() + 1; + } + + semaphore.release(lock); + } + } + + template + CUTLASS_DEVICE void run_kernel(Params const& params, SharedStorage& shared_storage) { + if constexpr (platform::is_same::value) { + run_kernel_(params, shared_storage); + } else { + CUTLASS_NOT_IMPLEMENTED(); + } + } + + /* + To improve compilation speed, we do not compile the device operator if the CUDA_ARCH does not correspond + to the ArchTag of the cutlass kernel operator. + */ + /// Executes one GEMM + CUTLASS_DEVICE + void operator()(Params const& params, SharedStorage& shared_storage) { +#if defined(__CUDA_ARCH__) +#if (__CUDA_ARCH__ >= 700) && (__CUDA_ARCH__ < 750) + run_kernel(params, shared_storage); +#elif (__CUDA_ARCH__ >= 750) && (__CUDA_ARCH__ < 800) + run_kernel(params, shared_storage); +#elif (__CUDA_ARCH__ >= 800) && (__CUDA_ARCH__ < 900) + run_kernel(params, shared_storage); +#elif (__CUDA_ARCH__ >= 900) + CUTLASS_NOT_IMPLEMENTED(); // Don't compile these for Hopper or later. Use CUTLASS 3.x kernels. +#else + static_assert(false, + "Invalid architecture being compiled. Only Volta+ supported in weight-only quantization kernels."); +#endif +#else + CUTLASS_NOT_IMPLEMENTED(); +#endif + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace kernel +} // namespace gemm +} // namespace cutlass diff --git a/onnxruntime/contrib_ops/cuda/moe/cutlass_extensions/gemm/kernel/gemm_moe_problem_visitor.h b/onnxruntime/contrib_ops/cuda/moe/cutlass_extensions/gemm/kernel/gemm_moe_problem_visitor.h new file mode 100644 index 000000000000..6cb5cc4e1334 --- /dev/null +++ b/onnxruntime/contrib_ops/cuda/moe/cutlass_extensions/gemm/kernel/gemm_moe_problem_visitor.h @@ -0,0 +1,66 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/*! 
\file + \brief Scheduler for grouped GEMM +*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/gemm/gemm.h" +#include "cutlass/gemm/kernel/gemm_grouped_problem_visitor.h" +#include "cutlass/matrix_coord.h" + +#include "contrib_ops/cuda/moe/cutlass_extensions/gemm/kernel/moe_problem_visitor.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace gemm { +namespace kernel { + +/// Visitor class to abstract away the algorithm for iterating over tiles +template <typename ThreadblockShape, GroupScheduleMode GroupScheduleMode_, int PrefetchTileCount, int ThreadCount, bool Transposed = false> +struct GemmMoeProblemVisitor + : public MoeProblemVisitor<detail::GemmGroupedProblemSizeHelper<Transposed>, ThreadblockShape, + GroupScheduleMode_, PrefetchTileCount, ThreadCount> { + static bool const kTransposed = Transposed; + + using ProblemSizeHelper = detail::GemmGroupedProblemSizeHelper<Transposed>; + using Base = + MoeProblemVisitor<ProblemSizeHelper, ThreadblockShape, GroupScheduleMode_, PrefetchTileCount, ThreadCount>; + using Params = typename Base::Params; + using SharedStorage = typename Base::SharedStorage; + + // + // Methods + // + CUTLASS_DEVICE + GemmMoeProblemVisitor(Params const& params_, SharedStorage& shared_storage_, int32_t block_idx) + : Base(params_, shared_storage_, block_idx) {} +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace kernel +} // namespace gemm +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/onnxruntime/contrib_ops/cuda/moe/cutlass_extensions/gemm/kernel/gemm_with_epilogue_visitor.h b/onnxruntime/contrib_ops/cuda/moe/cutlass_extensions/gemm/kernel/gemm_with_epilogue_visitor.h new file mode 100644 index 000000000000..fb35b2dbf12c --- /dev/null +++ b/onnxruntime/contrib_ops/cuda/moe/cutlass_extensions/gemm/kernel/gemm_with_epilogue_visitor.h @@ -0,0 +1,516 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief GEMM kernel to support the epilogue visitor model + for customized softmax partial reduction epilogue fusion. + + This source file will likely be moved to `include/cutlass/gemm/kernel/` in the future once + its usage has been stabilized. For now, it is included in this example to demonstrate + some basic output fusion options. + + original file: 3rdparty/cutlass/examples/35_gemm_softmax/gemm_with_epilogue_visitor.h +*/ + +#pragma once + +#include "cutlass/complex.h" +#include "cutlass/cutlass.h" +#include "cutlass/fast_math.h" +#include "cutlass/gemm/gemm.h" +#include "cutlass/matrix_coord.h" +#include "cutlass/semaphore.h" +#include "cutlass/trace.h" + +#include "contrib_ops/cuda/moe/cutlass_extensions/epilogue/threadblock/epilogue_per_row_per_col_scale.h" + +namespace tk = tensorrt_llm::common; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace gemm { +namespace kernel { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct GemmWithEpilogueVisitor { + public: + using Mma = Mma_; + using Epilogue = Epilogue_; + using EpilogueVisitor = typename Epilogue::Visitor; + using ThreadblockSwizzle = ThreadblockSwizzle_; + + using ElementA = typename Mma::IteratorA::Element; + using LayoutA = typename Mma::IteratorA::Layout; + using TensorRefA = TensorRef; + + using ElementB = typename Mma::IteratorB::Element; + using LayoutB = typename Mma::IteratorB::Layout; + using TensorRefB = TensorRef; + + using ElementCompute = typename EpilogueVisitor::ElementCompute; + using LayoutAlphaCol = cutlass::layout::RowMajor; + using LayoutAlphaRow = cutlass::layout::ColumnMajor; + using TensorRefAlphaCol = TensorRef; + using TensorRefAlphaRow = TensorRef; + + using ElementC = typename EpilogueVisitor::ElementOutput; + using LayoutC = typename Epilogue::Layout; + using TensorRefC = TensorRef; + + static ComplexTransform const kTransformA = Mma::kTransformA; + static ComplexTransform const kTransformB = Mma::kTransformB; + using Operator = typename Mma::Operator; + + using OperatorClass = typename Mma::Operator::OperatorClass; + using ThreadblockShape = typename Mma::Shape; + using WarpShape = typename Mma::Operator::Shape; + using InstructionShape = typename Mma::Policy::Operator::InstructionShape; + using ArchTag = typename Mma::ArchTag; + using EpilogueOutputOp = + typename Epilogue::Visitor::ElementwiseFunctor; // Define type so GemmUniversalBase doesn't complain + + static int const kStages = Mma::kStages; + static int const kAlignmentA = Mma::IteratorA::AccessType::kElements; + static int const kAlignmentB = Mma::IteratorB::AccessType::kElements; + static int const kAlignmentC = EpilogueVisitor::kElementsPerAccess; + + /// Warp count (concept: GemmShape) + using WarpCount = typename Mma::WarpCount; + static 
int const kThreadCount = 32 * WarpCount::kCount; + + /// Split-K preserves splits that are 128b aligned + static int const kSplitKAlignment = const_max(128 / sizeof_bits::value, 128 / sizeof_bits::value); + + // + // Structures + // + + /// Argument structure + struct Arguments { + // + // Data members + // + + GemmUniversalMode mode; + GemmCoord problem_size; + int batch_count; + + TensorRefA ref_A; + TensorRefB ref_B; + tk::QuantMode quant_option; + TensorRefAlphaCol ref_alpha_col; + TensorRefAlphaRow ref_alpha_row; + TensorRefC ref_C; + TensorRefC ref_D; + + int64_t batch_stride_A; + int64_t batch_stride_B; + int64_t batch_stride_D; + + typename EpilogueVisitor::Arguments epilogue_visitor; + + // + // Methods + // + + Arguments() : mode(GemmUniversalMode::kGemm), batch_count(1) {} + + /// constructs an arguments structure + Arguments(GemmUniversalMode mode_, GemmCoord problem_size_, int batch_count_, TensorRefA ref_A_, TensorRefB ref_B_, + tk::QuantMode quant_option_, TensorRefAlphaCol ref_alpha_col_, TensorRefAlphaRow ref_alpha_row_, + TensorRefC ref_C_, TensorRefC ref_D_, int64_t batch_stride_A_, int64_t batch_stride_B_, + typename EpilogueVisitor::Arguments epilogue_visitor_) + : mode(mode_), + problem_size(problem_size_), + batch_count(batch_count_), + ref_A(ref_A_), + ref_B(ref_B_), + quant_option(quant_option_), + ref_alpha_col(ref_alpha_col_), + ref_alpha_row(ref_alpha_row_), + ref_C(ref_C_), + ref_D(ref_D_), + batch_stride_A(batch_stride_A_), + batch_stride_B(batch_stride_B_), + batch_stride_D(0), + epilogue_visitor(epilogue_visitor_) {} + }; + + // + // Structure for precomputing values in host memory and passing to kernels + // + + /// Parameters structure + struct Params { + cutlass::gemm::GemmCoord problem_size; + cutlass::gemm::GemmCoord grid_tiled_shape; + int swizzle_log_tile; + + typename Mma::IteratorA::Params params_A; + typename Mma::IteratorB::Params params_B; + typename EpilogueVisitor::ScaleTileIterator::Params params_alpha_col; + typename EpilogueVisitor::ScaleTileIterator::Params params_alpha_row; + typename EpilogueVisitor::OutputTileIterator::Params params_C; + typename EpilogueVisitor::OutputTileIterator::Params params_D; + + GemmUniversalMode mode; + int batch_count; + int gemm_k_size; + + void* ptr_A; + void* ptr_B; + tk::QuantMode quant_option; + typename EpilogueVisitor::ScaleTileIterator::Element* ptr_alpha_col; + typename EpilogueVisitor::ScaleTileIterator::Element* ptr_alpha_row; + ElementC* ptr_C; + ElementC* ptr_D; + + int64_t batch_stride_A; + int64_t batch_stride_B; + + typename EpilogueVisitor::Params epilogue_visitor; + + // + // Methods + // + + CUTLASS_HOST_DEVICE + Params() + : swizzle_log_tile(0), + params_A(0), + params_B(0), + params_alpha_col(0), + params_C(0), + params_D(0), + batch_count(0), + gemm_k_size(0), + mode(cutlass::gemm::GemmUniversalMode::kGemm), + ptr_A(nullptr), + ptr_B(nullptr), + ptr_alpha_col(nullptr), + ptr_alpha_row(nullptr), + ptr_C(nullptr), + ptr_D(nullptr), + batch_stride_A(0), + batch_stride_B(0) {} + + Params(Arguments const& args, cutlass::gemm::GemmCoord const& grid_tiled_shape_, int gemm_k_size_, int* workspace_) + : problem_size(args.problem_size), + swizzle_log_tile(0), + params_A(args.ref_A.layout()), + params_B(args.ref_B.layout()), + params_alpha_col(args.ref_alpha_col.layout()), + params_alpha_row(args.ref_alpha_col.layout()), + params_C(args.ref_C.layout()), + params_D(args.ref_D.layout()), + mode(args.mode), + batch_count(args.batch_count), + gemm_k_size(args.problem_size.k()), + 
ptr_A(args.ref_A.data()), + ptr_B(args.ref_B.data()), + quant_option(args.quant_option), + ptr_alpha_col(args.ref_alpha_col.data()), + ptr_alpha_row(args.ref_alpha_row.data()), + ptr_C(args.ref_C.data()), + ptr_D(args.ref_D.data()), + batch_stride_A(args.batch_stride_A), + batch_stride_B(args.batch_stride_B), + epilogue_visitor(args.epilogue_visitor) { + ThreadblockSwizzle threadblock_swizzle; + + grid_tiled_shape = threadblock_swizzle.get_tiled_shape( + args.problem_size, {ThreadblockShape::kM, ThreadblockShape::kN, ThreadblockShape::kK}, args.batch_count); + + if (args.mode == GemmUniversalMode::kGemm || args.mode == GemmUniversalMode::kGemmSplitKParallel) { + int const kAlignK = + const_max(const_max(128 / sizeof_bits::value, 128 / sizeof_bits::value), 1); + + gemm_k_size = round_up(ceil_div(args.problem_size.k(), args.batch_count), kAlignK); + + if (gemm_k_size) { + grid_tiled_shape.k() = ceil_div(args.problem_size.k(), gemm_k_size); + } + } + + swizzle_log_tile = threadblock_swizzle.get_log_tile(grid_tiled_shape); + } + }; + + /// Shared memory storage structure + union SharedStorage { + typename Mma::SharedStorage main_loop; + + struct { + typename Epilogue::SharedStorage epilogue; + typename EpilogueVisitor::SharedStorage visitor; + } epilogue; + }; + + public: + // + // Methods + // + + CUTLASS_DEVICE + GemmWithEpilogueVisitor() {} + + /// Determines whether kernel satisfies alignment + static Status can_implement(cutlass::gemm::GemmCoord const& problem_size) { + CUTLASS_TRACE_HOST("GemmWithEpilogueVisitor::can_implement()"); + + static int const kAlignmentA = Mma::IteratorA::AccessType::kElements; + static int const kAlignmentB = Mma::IteratorB::AccessType::kElements; + static int const kAlignmentC = EpilogueVisitor::OutputTileIterator::kElementsPerAccess; + + bool isAMisaligned = false; + bool isBMisaligned = false; + bool isCMisaligned = false; + + if (platform::is_same::value) { + isAMisaligned = problem_size.k() % kAlignmentA; + } else if (platform::is_same::value) { + isAMisaligned = problem_size.m() % kAlignmentA; + } else if (platform::is_same>::value || + platform::is_same>::value) { + isAMisaligned = problem_size.k() % kAlignmentA; + } + + if (platform::is_same::value) { + isBMisaligned = problem_size.n() % kAlignmentB; + } else if (platform::is_same::value) { + isBMisaligned = problem_size.k() % kAlignmentB; + } else if (platform::is_same>::value || + platform::is_same>::value) { + isBMisaligned = problem_size.k() % kAlignmentB; + } + + if (platform::is_same::value) { + isCMisaligned = problem_size.n() % kAlignmentC; + } else if (platform::is_same::value) { + isCMisaligned = problem_size.m() % kAlignmentC; + } else if (platform::is_same>::value || + platform::is_same>::value) { + isCMisaligned = problem_size.n() % kAlignmentC; + } + + if (isAMisaligned) { + CUTLASS_TRACE_HOST(" returning kErrorMisalignedOperand for A operand"); + return Status::kErrorMisalignedOperand; + } + + if (isBMisaligned) { + CUTLASS_TRACE_HOST(" returning kErrorMisalignedOperand for B operand"); + return Status::kErrorMisalignedOperand; + } + + if (isCMisaligned) { + CUTLASS_TRACE_HOST(" returning kErrorMisalignedOperand for C operand"); + return Status::kErrorMisalignedOperand; + } + + CUTLASS_TRACE_HOST(" returning kSuccess"); + + return Status::kSuccess; + } + + static Status can_implement(Arguments const& args) { return can_implement(args.problem_size); } + + static size_t get_extra_workspace_size(Arguments const& args, cutlass::gemm::GemmCoord const& grid_tiled_shape) { + return 0; + } + 
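+  // (Editorial sketch, not from the original source: a worked example of the split-K slice
+  // sizing computed in Params above, under assumed values. With problem K = 4096, batch_count
+  // used as 3 split-K slices, and 16-bit A/B so kAlignK = 128 / 16 = 8:
+  //   gemm_k_size          = round_up(ceil_div(4096, 3), 8) = round_up(1366, 8) = 1368
+  //   grid_tiled_shape.k() = ceil_div(4096, 1368)           = 3
+  // so each slice covers at most 1368 of the 4096 K iterations and stays 128-bit aligned.)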
+#define SPLIT_K_ENABLED 1 + + /// Executes one GEMM + CUTLASS_DEVICE + void run_kernel_(Params const& params, SharedStorage& shared_storage) { + // Compute threadblock location + ThreadblockSwizzle threadblock_swizzle; + + cutlass::gemm::GemmCoord threadblock_tile_offset = threadblock_swizzle.get_tile_offset(params.swizzle_log_tile); + + // Early exit if CTA is out of range + if (params.grid_tiled_shape.m() <= threadblock_tile_offset.m() || + params.grid_tiled_shape.n() <= threadblock_tile_offset.n()) { + return; + } + + int offset_k = 0; + int problem_size_k = params.problem_size.k(); + + ElementA* ptr_A = static_cast(params.ptr_A); + ElementB* ptr_B = static_cast(params.ptr_B); + +#if SPLIT_K_ENABLED + // + // Fetch pointers based on mode. + // + if (params.mode == GemmUniversalMode::kGemm || params.mode == GemmUniversalMode::kGemmSplitKParallel) { + if (threadblock_tile_offset.k() + 1 < params.grid_tiled_shape.k()) { + problem_size_k = (threadblock_tile_offset.k() + 1) * params.gemm_k_size; + } + + offset_k = threadblock_tile_offset.k() * params.gemm_k_size; + } else if (params.mode == GemmUniversalMode::kBatched) { + ptr_A += threadblock_tile_offset.k() * params.batch_stride_A; + ptr_B += threadblock_tile_offset.k() * params.batch_stride_B; + } else if (params.mode == GemmUniversalMode::kArray) { + ptr_A = static_cast(params.ptr_A)[threadblock_tile_offset.k()]; + ptr_B = static_cast(params.ptr_B)[threadblock_tile_offset.k()]; + } +#endif + + // Compute initial location in logical coordinates + cutlass::MatrixCoord tb_offset_A{ + threadblock_tile_offset.m() * Mma::Shape::kM, + offset_k, + }; + + cutlass::MatrixCoord tb_offset_B{offset_k, threadblock_tile_offset.n() * Mma::Shape::kN}; + + // Compute position within threadblock + int thread_idx = threadIdx.x; + + // Construct iterators to A and B operands + typename Mma::IteratorA iterator_A(params.params_A, ptr_A, {params.problem_size.m(), problem_size_k}, thread_idx, + tb_offset_A); + + typename Mma::IteratorB iterator_B(params.params_B, ptr_B, {problem_size_k, params.problem_size.n()}, thread_idx, + tb_offset_B); + + // Broadcast the warp_id computed by lane 0 to ensure dependent code + // is compiled as warp-uniform. 
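+    // (Every lane receives lane 0's value of threadIdx.x / 32 below, so the compiler can prove
+    // the result is identical across the warp and keep it in a single uniform register instead
+    // of re-deriving it per lane.)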
+ int warp_idx = __shfl_sync(0xffffffff, threadIdx.x / 32, 0); + + int lane_idx = threadIdx.x % 32; + + // + // Main loop + // + + // Construct thread-scoped matrix multiply + Mma mma(shared_storage.main_loop, thread_idx, warp_idx, lane_idx); + + typename Mma::FragmentC accumulators; + + accumulators.clear(); + + // Compute threadblock-scoped matrix multiply-add + int gemm_k_iterations = (problem_size_k - offset_k + Mma::Shape::kK - 1) / Mma::Shape::kK; + + // Compute threadblock-scoped matrix multiply-add + mma(gemm_k_iterations, accumulators, iterator_A, iterator_B, accumulators); + + // + // Masked tile iterators constructed from members + // + + threadblock_tile_offset = threadblock_swizzle.get_tile_offset(params.swizzle_log_tile); + + // assume identity swizzle + MatrixCoord threadblock_offset(threadblock_tile_offset.m() * Mma::Shape::kM, + threadblock_tile_offset.n() * Mma::Shape::kN); + + int block_idx = threadblock_tile_offset.m() + threadblock_tile_offset.n() * params.grid_tiled_shape.m(); + + // + // Construct the epilogue visitor + // + + EpilogueVisitor epilogue_visitor( + params.epilogue_visitor, shared_storage.epilogue.visitor, params.problem_size.mn(), thread_idx, warp_idx, + lane_idx, params.params_alpha_col, params.params_C, params.params_D, params.quant_option, params.ptr_alpha_row, + params.ptr_alpha_col, params.ptr_C, params.ptr_D, threadblock_offset, blockIdx.y * params.problem_size.m()); + + if (params.mode == GemmUniversalMode::kGemm) { + // Indicate which position in a serial reduction the output operator is currently updating + epilogue_visitor.set_k_partition(threadblock_tile_offset.k(), params.grid_tiled_shape.k()); + } else if (params.mode == GemmUniversalMode::kBatched || params.mode == GemmUniversalMode::kArray) { + epilogue_visitor.set_batch_index(threadblock_tile_offset.k()); + } + + // Construct the epilogue + Epilogue epilogue(shared_storage.epilogue.epilogue, thread_idx, warp_idx, lane_idx); + + // Execute the epilogue operator to update the destination tensor. + epilogue(epilogue_visitor, accumulators); + } + + template + CUTLASS_DEVICE void run_kernel(Params const& params, SharedStorage& shared_storage) { + if constexpr (platform::is_same::value) { + run_kernel_(params, shared_storage); + } else { + CUTLASS_NOT_IMPLEMENTED(); + } + } + + /* + To improve compilation speed, we do not compile the device operator if the CUDA_ARCH does not correspond + to the ArchTag of the cutlass kernel operator. + */ + /// Executes one GEMM + CUTLASS_DEVICE + void operator()(Params const& params, SharedStorage& shared_storage) { +#if defined(__CUDA_ARCH__) +#if (__CUDA_ARCH__ >= 700) && (__CUDA_ARCH__ < 720) + run_kernel(params, shared_storage); +#elif (__CUDA_ARCH__ >= 720) && (__CUDA_ARCH__ < 750) + run_kernel(params, shared_storage); +#elif (__CUDA_ARCH__ >= 750) && (__CUDA_ARCH__ < 800) + run_kernel(params, shared_storage); +#elif (__CUDA_ARCH__ >= 800) && (__CUDA_ARCH__ < 900) + run_kernel(params, shared_storage); +#elif (__CUDA_ARCH__ >= 900) + // replace with CUTLASS_NOT_IMPLEMENTED() and upgrade to 3.x kernels. + run_kernel(params, shared_storage); +#else + static_assert(false, + "Invalid architecture being compiled. 
Only Volta+ supported in weight-only quantization kernels."); +#endif +#else + CUTLASS_NOT_IMPLEMENTED(); +#endif + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace kernel +} // namespace gemm +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/onnxruntime/contrib_ops/cuda/moe/cutlass_extensions/gemm/kernel/mixed_gemm_B_layout.h b/onnxruntime/contrib_ops/cuda/moe/cutlass_extensions/gemm/kernel/mixed_gemm_B_layout.h new file mode 100644 index 000000000000..35d22b2f55a8 --- /dev/null +++ b/onnxruntime/contrib_ops/cuda/moe/cutlass_extensions/gemm/kernel/mixed_gemm_B_layout.h @@ -0,0 +1,126 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + This file exists so that we use the same weight layout for MoE grouped gemm and regular gemm when the weight is + quantized. The preprocessing code reads this template to know how to organize the quantized weight matrices + to be consumed by CUTLASS. + + Note that for int4, ThreadBlockK MUST be 64. + + */ + +#pragma once + +#include "cutlass/layout/matrix.h" +#include "cutlass/numeric_types.h" + +#include "cutlass/arch/arch.h" +#include "cutlass/arch/mma.h" +#include "cutlass/platform/platform.h" + +#include "contrib_ops/cuda/moe/cutlass_extensions/arch/mma.h" +#include "contrib_ops/cuda/moe/cutlass_extensions/tile_interleaved_layout.h" + +namespace cutlass { +namespace gemm { +namespace kernel { + +template <typename TypeB, typename Arch, typename Enable = void> +struct LayoutDetailsB {}; + +// Volta specializations. Volta will dequantize before STS, so we need a different operator +template <typename TypeB> +struct LayoutDetailsB<TypeB, arch::Sm70> { + static constexpr int ThreadblockK = 64; + using Layout = layout::ColumnMajor; + static constexpr int ElementsPerAccess = 8; + using Operator = cutlass::arch::OpMultiplyAdd; +}; + +// Specializations for Turing+ when B is FP16. These are currently only used for MoE networks. +// Switch this to column major for weights since gemms should be more performant. +template +struct LayoutDetailsB= 75>::type> { + static constexpr int ThreadblockK = 64; + using Layout = layout::ColumnMajor; + static constexpr int ElementsPerAccess = 128 / cutlass::sizeof_bits::value; + using Operator = cutlass::arch::OpMultiplyAdd; +}; + +template +struct LayoutDetailsB= 75>::type> { + static constexpr int ThreadblockK = 64; + using Layout = layout::ColumnMajor; + static constexpr int ElementsPerAccess = 128 / cutlass::sizeof_bits::value; + using Operator = cutlass::arch::OpMultiplyAdd; +}; + +// Specializations for Turing+ when B is quantized. These can use the operator OpMultiplyAddDequantizeInterleavedBToA, +// which signals that we want to dequantize after loading from smem.
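+// (Editorial sketch, derived from the constants in the specializations below: a 128-byte cache
+// line holds ElementsPerCacheLine = 128 * 8 / sizeof_bits<TypeB>::value elements, so with
+// ThreadblockK = 64:
+//   uint8_t : 1024 / 8 = 128 elements per line, ColumnsInterleaved = 128 / 64 = 2
+//   uint4b_t: 1024 / 4 = 256 elements per line, ColumnsInterleaved = 256 / 64 = 4
+// i.e. the preprocessing step packs two (resp. four) 64-element columns of B into each cache line.)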
+template + struct LayoutDetailsB < + uint8_t, + Arch, + typename platform::enable_if= 75 && Arch::kMinComputeCapability<90>::type> { + static constexpr int ThreadblockK = 64; + + private: + static constexpr int ElementsPerCacheLine = 128 * 8 / sizeof_bits::value; + static constexpr int ColumnsInterleaved = ElementsPerCacheLine / ThreadblockK; + + public: + using Layout = layout::ColumnMajorTileInterleave; + static constexpr int ElementsPerAccess = 128 / cutlass::sizeof_bits::value; + using Operator = cutlass::arch::OpMultiplyAddDequantizeInterleavedBToA; +}; + +template + struct LayoutDetailsB < + uint4b_t, + Arch, + typename platform::enable_if= 75 && Arch::kMinComputeCapability<90>::type> { + static constexpr int ThreadblockK = 64; + + private: + static constexpr int ElementsPerCacheLine = 128 * 8 / sizeof_bits::value; + static constexpr int ColumnsInterleaved = ElementsPerCacheLine / ThreadblockK; + + public: + using Layout = layout::ColumnMajorTileInterleave; + static constexpr int ElementsPerAccess = 128 / cutlass::sizeof_bits::value; + using Operator = cutlass::arch::OpMultiplyAddDequantizeInterleavedBToA; +}; + +template +struct LayoutDetailsB= 90>::type> { + static constexpr int ThreadblockK = 64; + using Layout = layout::ColumnMajor; + static constexpr int ElementsPerAccess = 128 / cutlass::sizeof_bits::value; + using Operator = cutlass::arch::OpMultiplyAdd; +}; + +template +struct LayoutDetailsB= 90>::type> { + static constexpr int ThreadblockK = 64; + using Layout = layout::ColumnMajor; + static constexpr int ElementsPerAccess = 128 / cutlass::sizeof_bits::value; + using Operator = cutlass::arch::OpMultiplyAdd; +}; + +} // namespace kernel +} // namespace gemm +} // namespace cutlass diff --git a/onnxruntime/contrib_ops/cuda/moe/cutlass_extensions/gemm/kernel/moe_cutlass_kernel.h b/onnxruntime/contrib_ops/cuda/moe/cutlass_extensions/gemm/kernel/moe_cutlass_kernel.h new file mode 100644 index 000000000000..9e3e9d20d7f6 --- /dev/null +++ b/onnxruntime/contrib_ops/cuda/moe/cutlass_extensions/gemm/kernel/moe_cutlass_kernel.h @@ -0,0 +1,471 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/*! 
\file + \brief +*/ + +#pragma once + +#include "cutlass/complex.h" +#include "cutlass/cutlass.h" +#include "cutlass/fast_math.h" +#include "cutlass/gemm/gemm.h" +#include "cutlass/matrix_coord.h" +#include "cutlass/semaphore.h" + +#include "cutlass/gemm/kernel/gemm_transpose_operands.h" +#include "cutlass/layout/matrix.h" +#include "cutlass/trace.h" + +#include "contrib_ops/cuda/moe/cutlass_extensions/gemm/kernel/gemm_moe_problem_visitor.h" +#include "contrib_ops/cuda/moe/cutlass_extensions/tile_interleaved_layout.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace gemm { +namespace kernel { + +///////////////////////////////////////////////////////////////////////////////////////////////// +// This section exists so that we can use the same kernel code for regular gemm and dequantizing gemms. +// It will dispatch to the dequantizing gemm if the Mma type has an Iterator for scales in global. +template <typename...> +using void_t = void; + +template <typename Mma, typename = void> +struct use_dq_gemm : platform::false_type {}; + +template <typename Mma> +struct use_dq_gemm<Mma, void_t<typename Mma::IteratorScale>> : platform::true_type {}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template <typename Mma_, typename Epilogue_, typename ThreadblockSwizzle_, GroupScheduleMode GroupScheduleMode_> +struct MoeFCGemm { + public: + using Mma = Mma_; + using Epilogue = Epilogue_; + using EpilogueOutputOp = typename Epilogue::OutputOp; + using ThreadblockSwizzle = ThreadblockSwizzle_; + static GroupScheduleMode const kGroupScheduleMode = GroupScheduleMode_; + static bool const kTransposed = false; + + // Optional transpose + using MapArguments = + kernel::detail::MapArguments; + + // Public-facing type definitions related to operand element type, layout, and complex conjugate + // operation. Must interact with the 'kTransposed' notion. + static_assert(!kTransposed, "Transpose problem not supported"); + using ElementA = typename MapArguments::ElementA; + using LayoutA = typename MapArguments::LayoutA; + using ElementB = typename MapArguments::ElementB; + using LayoutB = typename MapArguments::LayoutB; + using ElementC = typename Epilogue::OutputTileIterator::Element; + using LayoutC = typename MapArguments::LayoutC; + using ElementScale = ElementC; + + static ComplexTransform const kTransformA = MapArguments::kTransformA; + static ComplexTransform const kTransformB = MapArguments::kTransformB; + + // Type definitions about the mainloop.
+ using Operator = typename Mma::Operator; + using OperatorClass = typename Mma::Operator::OperatorClass; + using ThreadblockShape = typename Mma::Shape; + using WarpShape = typename Mma::Operator::Shape; + using InstructionShape = typename Mma::Policy::Operator::InstructionShape; + using ArchTag = typename Mma::ArchTag; + + static int const kStages = Mma::kStages; + static int const kAlignmentA = MapArguments::kAlignmentA; + static int const kAlignmentB = MapArguments::kAlignmentB; + static int const kAlignmentC = Epilogue::OutputTileIterator::kElementsPerAccess; + + /// Warp count (concept: GemmShape) + using WarpCount = typename Mma::WarpCount; + static int const kThreadCount = 32 * WarpCount::kCount; + + using ProblemVisitor = + GemmMoeProblemVisitor; + + // + // Structures + // + + /// Argument structure + struct Arguments { + // + // Data members + // + + int problem_count; + int threadblock_count; + int group_size; + + typename EpilogueOutputOp::Params output_op; + + ElementA* ptr_A; + ElementB* ptr_B; + ElementScale* weight_scales; + ElementC* ptr_C; + ElementC* ptr_D; + + int64_t* total_rows_before_expert; + int64_t gemm_n; + int64_t gemm_k; + + // Only used by device-level operator + GemmCoord* host_problem_sizes; + + // + // Methods + // + + /// Default ctor + CUTLASS_HOST_DEVICE + Arguments() + : problem_count(0), + threadblock_count(0), + ptr_A(nullptr), + ptr_B(nullptr), + weight_scales(nullptr), + ptr_C(nullptr), + ptr_D(nullptr), + total_rows_before_expert(nullptr), + gemm_n(0), + gemm_k(0), + host_problem_sizes(nullptr) {} + + /// Ctor + CUTLASS_HOST_DEVICE + Arguments(int problem_count, int threadblock_count, int group_size, typename EpilogueOutputOp::Params output_op, + ElementA const* ptr_A, ElementB const* ptr_B, ElementScale const* weight_scales, ElementC const* ptr_C, + ElementC* ptr_D, int64_t* total_rows_before_expert, int64_t gemm_n, int64_t gemm_k, + GemmCoord* host_problem_sizes = nullptr) + : problem_count(problem_count), + threadblock_count(threadblock_count), + group_size(group_size), + output_op(output_op), + ptr_A(const_cast(ptr_A)), + ptr_B(const_cast(ptr_B)), + weight_scales(const_cast(weight_scales)), + ptr_C(const_cast(ptr_C)), + ptr_D(ptr_D), + total_rows_before_expert(total_rows_before_expert), + gemm_n(gemm_n), + gemm_k(gemm_k), + host_problem_sizes(nullptr) { + if (platform::is_same::value || platform::is_same::value) { + assert(weight_scales); + } + } + }; + + // + // Structure for precomputing values in host memory and passing to kernels + // + + /// Parameters structure + struct Params { + typename ProblemVisitor::Params problem_visitor; + int threadblock_count; + int group_size; + + typename EpilogueOutputOp::Params output_op; + + ElementA* ptr_A; + ElementB* ptr_B; + ElementScale* weight_scales; + ElementC* ptr_C; + ElementC* ptr_D; + + // + // Methods + // + + CUTLASS_HOST_DEVICE + Params() : ptr_A(nullptr), ptr_B(nullptr), weight_scales(nullptr), ptr_C(nullptr), ptr_D(nullptr) {} + + CUTLASS_HOST_DEVICE + explicit Params(Arguments const& args, void* workspace = nullptr, int tile_count = 0) + : problem_visitor(args.total_rows_before_expert, args.gemm_n, args.gemm_k, args.problem_count, workspace, + tile_count), + threadblock_count(args.threadblock_count), + group_size(args.group_size), + output_op(args.output_op), + ptr_A(args.ptr_A), + ptr_B(args.ptr_B), + weight_scales(args.weight_scales), + ptr_C(args.ptr_C), + ptr_D(args.ptr_D) {} + + CUTLASS_HOST_DEVICE + void update(Arguments const& args, void* workspace = nullptr, int tile_count = 0) { 
+ problem_visitor = typename ProblemVisitor::Params(args.total_rows_before_expert, args.gemm_n, args.gemm_k, + args.problem_count, workspace, tile_count); + threadblock_count = args.threadblock_count; + output_op = args.output_op; + ptr_A = args.ptr_A; + ptr_B = args.ptr_B; + weight_scales = args.weight_scales; + ptr_C = args.ptr_C; + ptr_D = args.ptr_D; + } + }; + + /// Shared memory storage structure + union SharedStorage { + typename ProblemVisitor::SharedStorage problem_visitor; + typename Mma::SharedStorage main_loop; + typename Epilogue::SharedStorage epilogue; + }; + + public: + // + // Methods + // + + CUTLASS_DEVICE + MoeFCGemm() {} + + /// Determines whether kernel satisfies alignment + static Status can_implement(cutlass::gemm::GemmCoord const& problem_size) { return Status::kSuccess; } + + static Status can_implement(Arguments const& args) { + if (platform::is_same::value || platform::is_same::value) { + if (args.weight_scales == nullptr) { + CUTLASS_TRACE_HOST("MoeFCGemm::can_implement() - weight scales are required for uint8_t and uint4b_t"); + return Status::kInvalid; + } + } else if (args.weight_scales != nullptr) { + CUTLASS_TRACE_HOST( + "MoeFCGemm::can_implement() - weight scales are ignored for all types except uint8_t and uint4b_t"); + return Status::kInvalid; + } else if (args.group_size != args.gemm_k) { + CUTLASS_TRACE_HOST("MoeFCGemm::can_implement() - scale shape should be (1, gemm_n)"); + return Status::kInvalid; + } else if (static_cast(args.gemm_n) < Mma::IteratorB::AccessType::kElements) { + CUTLASS_TRACE_HOST("MoeFCGemm::can_implement() - gemm_n is smaller than the input alignment"); + return Status::kInvalid; + } + return Status::kSuccess; + } + + static size_t get_extra_workspace_size(Arguments const& args, cutlass::gemm::GemmCoord const& grid_tiled_shape) { + return 0; + } + + CUTLASS_DEVICE + void run_kernel_(Params const& params, SharedStorage& shared_storage) { + // + // These types shadow the type-level definitions and support the ability to implement + // a 'transposed' GEMM that computes the transposed problems. + // + using ElementA = typename Mma::IteratorA::Element; + using LayoutA = typename Mma::IteratorA::Layout; + using ElementB = typename Mma::IteratorB::Element; + using LayoutB = typename Mma::IteratorB::Layout; + using ElementC = typename Epilogue::OutputTileIterator::Element; + using LayoutC = typename Epilogue::OutputTileIterator::Layout; + static constexpr int kInterleave = Mma::IteratorB::Shape::kRow / Mma::Shape::kK; + static_assert(platform::is_same::value && kInterleave == 1 || + platform::is_same::value && kInterleave >= 1, + "B must be row major/col major OR col major interleaved."); + + // + // Problem visitor. 
+ // + ProblemVisitor problem_visitor(params.problem_visitor, shared_storage.problem_visitor, blockIdx.x); + + const int64_t gemm_k = params.problem_visitor.gemm_k; + const int64_t gemm_n = params.problem_visitor.gemm_n; + int64_t bytes_per_expert_matrix = (gemm_k * gemm_n / 8) * cutlass::sizeof_bits::value; + + // Outer 'persistent' loop to iterate over tiles + int loop = 0; + while (problem_visitor.next_tile()) { + loop++; + + GemmCoord problem_size = problem_visitor.problem_size(); + int32_t problem_idx = problem_visitor.problem_index(); + int32_t cta_idx = int32_t(problem_visitor.threadblock_idx()); + + GemmCoord grid_shape = problem_visitor.grid_shape(problem_size); + + cutlass::gemm::GemmCoord threadblock_offset(static_cast(cta_idx / grid_shape.n()) * Mma::Shape::kM, + static_cast(cta_idx % grid_shape.n()) * Mma::Shape::kN, 0); + + // Load element pointers. Exchange pointers and strides if working on the transpose + const int64_t rows_to_jump = problem_idx == 0 ? 0 : params.problem_visitor.last_row_for_problem[problem_idx - 1]; + ElementA* ptr_A = reinterpret_cast(params.ptr_A) + rows_to_jump * gemm_k; + typename LayoutA::LongIndex ldm_A = gemm_k; + + char* byte_ptr_B = (reinterpret_cast(params.ptr_B)) + problem_idx * bytes_per_expert_matrix; + ElementB* ptr_B = reinterpret_cast(byte_ptr_B); + typename LayoutB::LongIndex ldm_B = + platform::is_same::value ? gemm_n : gemm_k * kInterleave; + + // Compute initial location in logical coordinates + cutlass::MatrixCoord tb_offset_A{ + threadblock_offset.m(), + 0, + }; + + cutlass::MatrixCoord tb_offset_B{0, threadblock_offset.n() / kInterleave}; + + cutlass::MatrixCoord tb_offset_scale{0, threadblock_offset.n()}; + + // Compute position within threadblock + int thread_idx = threadIdx.x; + + // Construct iterators to A and B operands + typename Mma::IteratorA iterator_A(LayoutA(ldm_A), ptr_A, {problem_size.m(), problem_size.k()}, thread_idx, + tb_offset_A); + + typename Mma::IteratorB iterator_B(LayoutB(ldm_B), ptr_B, + {problem_size.k() * kInterleave, problem_size.n() / kInterleave}, thread_idx, + tb_offset_B); + + typename Mma::FragmentC accumulators; + + accumulators.clear(); + + // Broadcast the warp_id computed by lane 0 to ensure dependent code + // is compiled as warp-uniform. + int warp_idx = __shfl_sync(0xffffffff, threadIdx.x / 32, 0); + + int lane_idx = threadIdx.x % 32; + + // + // Matrix multiply phase + // + + // Construct thread-scoped matrix multiply + auto CreateMMA = [&]() { + if constexpr (use_dq_gemm::value) + return Mma(shared_storage.main_loop, params.group_size, thread_idx, warp_idx, lane_idx); + else + return Mma(shared_storage.main_loop, thread_idx, warp_idx, lane_idx); + }; + Mma mma = CreateMMA(); + + // Compute threadblock-scoped matrix multiply-add + int gemm_k_iterations = (problem_size.k() + Mma::Shape::kK - 1) / Mma::Shape::kK; + + // Wait for all threads to finish their epilogue phases from the previous tile. 
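+      // (SharedStorage overlaps the mainloop and epilogue stages in a union, so without the
+      // barrier below a thread could begin writing mainloop smem for this tile while another
+      // thread is still reading epilogue smem from the previous tile.)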
+ __syncthreads(); + + // Compute threadblock-scoped matrix multiply-add + ElementScale* weight_scale_ptr = params.weight_scales + problem_idx * problem_size.n(); + + if constexpr (use_dq_gemm::value) { + const MatrixCoord scale_extent = {1, problem_size.n()}; + typename Mma::IteratorScale iterator_scale(Mma::IteratorScale::Layout(scale_extent.column()), weight_scale_ptr, + scale_extent, thread_idx, tb_offset_scale); + + mma(gemm_k_iterations, accumulators, iterator_A, iterator_B, iterator_scale, accumulators); + } else { + mma(gemm_k_iterations, accumulators, iterator_A, iterator_B, accumulators); + } + + // + // Epilogue + // + + EpilogueOutputOp output_op(params.output_op); + + ElementC* ptr_C = reinterpret_cast(params.ptr_C) + problem_idx * gemm_n; + ElementC* ptr_D = reinterpret_cast(params.ptr_D) + rows_to_jump * gemm_n; + + LayoutC layout_C(0); + LayoutC layout_D(gemm_n); + + typename Epilogue::OutputTileIterator::Params params_C(layout_C); + typename Epilogue::OutputTileIterator::Params params_D(layout_D); + + // Tile iterator loading from source tensor. + typename Epilogue::OutputTileIterator iterator_C(params_C, ptr_C, problem_size.mn(), thread_idx, + threadblock_offset.mn()); + + // Tile iterator writing to destination tensor. + typename Epilogue::OutputTileIterator iterator_D(params_D, ptr_D, problem_size.mn(), thread_idx, + threadblock_offset.mn()); + + Epilogue epilogue(shared_storage.epilogue, thread_idx, warp_idx, lane_idx); + + // Execute the epilogue operator to update the destination tensor. + epilogue(output_op, iterator_D, accumulators, iterator_C); + + // Next tile + problem_visitor.advance(gridDim.x); + } + } + + template + CUTLASS_DEVICE void run_kernel(Params const& params, SharedStorage& shared_storage) { + if constexpr (platform::is_same::value) { + run_kernel_(params, shared_storage); + } else { + CUTLASS_NOT_IMPLEMENTED(); + } + } + + /* + To improve compilation speed, we do not compile the device operator if the CUDA_ARCH does not correspond + to the ArchTag of the cutlass kernel operator. + */ + /// Executes one GEMM + CUTLASS_DEVICE + void operator()(Params const& params, SharedStorage& shared_storage) { +#if defined(__CUDA_ARCH__) +#if (__CUDA_ARCH__ >= 700) && (__CUDA_ARCH__ < 750) + run_kernel(params, shared_storage); +#elif (__CUDA_ARCH__ >= 750) && (__CUDA_ARCH__ < 800) + run_kernel(params, shared_storage); +#elif (__CUDA_ARCH__ >= 800) && (__CUDA_ARCH__ < 900) + run_kernel(params, shared_storage); +#elif (__CUDA_ARCH__ >= 900) + run_kernel(params, + shared_storage); // Don't compile these for Hopper or later. Use CUTLASS 3.x kernels. +#else + // static_assert(false, + // "Invalid architecture being compiled. 
Only Volta+ supported in weight-only quantization kernels."); + ; +#endif +#else + CUTLASS_NOT_IMPLEMENTED(); +#endif + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace kernel +} // namespace gemm +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_problem_visitor.h b/onnxruntime/contrib_ops/cuda/moe/cutlass_extensions/gemm/kernel/moe_problem_visitor.h similarity index 80% rename from onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_problem_visitor.h rename to onnxruntime/contrib_ops/cuda/moe/cutlass_extensions/gemm/kernel/moe_problem_visitor.h index 157437439cd0..6852d4c811b4 100644 --- a/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_problem_visitor.h +++ b/onnxruntime/contrib_ops/cuda/moe/cutlass_extensions/gemm/kernel/moe_problem_visitor.h @@ -1,40 +1,24 @@ -/*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. - * SPDX-License-Identifier: BSD-3-Clause +/* + * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at * - * 1. Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. + * http://www.apache.org/licenses/LICENSE-2.0 * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - **************************************************************************************************/ + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ /*! 
\file \brief Base scheduler for grouped problems, using MoE */ -#ifdef USE_CUTLASS - #pragma once #include "cutlass/gemm/kernel/grouped_problem_visitor.h" @@ -108,7 +92,7 @@ struct BaseMoeProblemVisitor { /// Get the grid shape CUTLASS_HOST_DEVICE - static cutlass::gemm::GemmCoord grid_shape(const cutlass::gemm::GemmCoord& problem) { + static cutlass::gemm::GemmCoord grid_shape(cutlass::gemm::GemmCoord const& problem) { return cutlass::gemm::GemmCoord(((problem.m() - 1 + ThreadblockShape::kM) / ThreadblockShape::kM), ((problem.n() - 1 + ThreadblockShape::kN) / ThreadblockShape::kN), 1); } @@ -147,9 +131,9 @@ struct BaseMoeProblemVisitor { } CUTLASS_HOST_DEVICE - static int32_t tile_count(const cutlass::gemm::GemmCoord& grid) { return ProblemSizeHelper::tile_count(grid); } + static int32_t tile_count(cutlass::gemm::GemmCoord const& grid) { return ProblemSizeHelper::tile_count(grid); } - static int32_t group_tile_count(const cutlass::gemm::GemmCoord* host_problem_sizes_ptr, int32_t problem_count) { + static int32_t group_tile_count(cutlass::gemm::GemmCoord const* host_problem_sizes_ptr, int32_t problem_count) { int32_t total_tiles = 0; for (int32_t i = 0; i < problem_count; ++i) { auto problem = host_problem_sizes_ptr[i]; @@ -278,17 +262,15 @@ struct MoeProblemVisitor +struct SplitkGemmGrouped { + public: + using Mma = Mma_; + using Epilogue = Epilogue_; + using EpilogueOutputOp = typename Epilogue::OutputOp; + using ThreadblockSwizzle = ThreadblockSwizzle_; + static GroupScheduleMode const kGroupScheduleMode = GroupScheduleMode_; + static bool const kTransposed = Transposed; + + // Optional transpose + using MapArguments = + kernel::detail::MapArguments; + + // Public-facing type definitions related to operand element type, layout, and complex conjugate + // operation. Must interact with the 'kTransposed' notion. + using ElementA = typename MapArguments::ElementA; + using LayoutA = typename MapArguments::LayoutA; + using ElementB = typename MapArguments::ElementB; + using LayoutB = typename MapArguments::LayoutB; + using ElementC = typename Epilogue::OutputTileIterator::Element; + using LayoutC = typename MapArguments::LayoutC; + + using ElementFinalOutput = typename MapArguments::ElementA; + + static ComplexTransform const kTransformA = MapArguments::kTransformA; + static ComplexTransform const kTransformB = MapArguments::kTransformB; + + // Type definitions about the mainloop. 
+ using Operator = typename Mma::Operator; + using OperatorClass = typename Mma::Operator::OperatorClass; + using ThreadblockShape = typename Mma::Shape; + using WarpShape = typename Mma::Operator::Shape; + using InstructionShape = typename Mma::Policy::Operator::InstructionShape; + using ArchTag = typename Mma::ArchTag; + + static int const kStages = Mma::kStages; + static int const kAlignmentA = MapArguments::kAlignmentA; + static int const kAlignmentB = MapArguments::kAlignmentB; + static int const kAlignmentC = Epilogue::OutputTileIterator::kElementsPerAccess; + + /// Warp count (concept: GemmShape) + using WarpCount = typename Mma::WarpCount; + static int const kThreadCount = 32 * WarpCount::kCount; + + using ProblemVisitor = + GemmGroupedProblemVisitor; + + // + // Structures + // + + /// Argument structure + struct Arguments { + // + // Data members + // + + GemmCoord* problem_sizes; + int problem_count; + int threadblock_count; + + typename EpilogueOutputOp::Params output_op; + + ElementA** ptr_A; + ElementB** ptr_B; + ElementFinalOutput** ptr_C; + ElementFinalOutput** ptr_D; + + typename LayoutA::Stride::LongIndex* lda; + typename LayoutB::Stride::LongIndex* ldb; + typename LayoutC::Stride::LongIndex* ldc; + typename LayoutC::Stride::LongIndex* ldd; + + // Only used by device-level operator + GemmCoord* host_problem_sizes; + + // splitK + int split_k_slices; + int64_t* splitk_buffer_offsets; + + // + // Methods + // + + /// Default ctor + CUTLASS_HOST_DEVICE + Arguments() + : problem_count(0), + threadblock_count(0), + ptr_A(nullptr), + ptr_B(nullptr), + ptr_C(nullptr), + ptr_D(nullptr), + lda(nullptr), + ldb(nullptr), + ldc(nullptr), + ldd(nullptr), + host_problem_sizes(nullptr), + split_k_slices(1), + splitk_buffer_offsets(nullptr) {} + + /// Ctor + CUTLASS_HOST_DEVICE + Arguments(GemmCoord* problem_sizes, int problem_count, int threadblock_count, + typename EpilogueOutputOp::Params output_op, ElementA** ptr_A, ElementB** ptr_B, + ElementFinalOutput** ptr_C, ElementFinalOutput** ptr_D, typename LayoutA::Stride::LongIndex* lda, + typename LayoutB::Stride::LongIndex* ldb, typename LayoutC::Stride::LongIndex* ldc, + typename LayoutC::Stride::LongIndex* ldd, GemmCoord* host_problem_sizes, int split_k_slices, + int64_t* splitk_buffer_offsets) + : problem_sizes(problem_sizes), + problem_count(problem_count), + threadblock_count(threadblock_count), + output_op(output_op), + ptr_A(ptr_A), + ptr_B(ptr_B), + ptr_C(ptr_C), + ptr_D(ptr_D), + lda(lda), + ldb(ldb), + ldc(ldc), + ldd(ldd), + host_problem_sizes(host_problem_sizes), + split_k_slices(split_k_slices), + splitk_buffer_offsets(splitk_buffer_offsets) {} + }; + + // + // Structure for precomputing values in host memory and passing to kernels + // + + /// Parameters structure + struct Params { + typename ProblemVisitor::Params problem_visitor; + int threadblock_count; + + typename EpilogueOutputOp::Params output_op; + + ElementA** ptr_A; + ElementB** ptr_B; + ElementFinalOutput** ptr_C; + ElementFinalOutput** ptr_D; + ElementC* ptr_C_split; + ElementC* ptr_D_split; + + typename LayoutA::Stride::LongIndex* lda; + typename LayoutB::Stride::LongIndex* ldb; + typename LayoutC::Stride::LongIndex* ldc; + typename LayoutC::Stride::LongIndex* ldd; + + // + // Methods + // + + // splitk + GemmCoord grid_tiled_shape; + int swizzle_log_tile; + int gemm_k_size; + GemmCoord* host_problem_sizes; + int split_k_slices; + int64_t* splitk_buffer_offsets; + + CUTLASS_HOST_DEVICE + Params() + : ptr_A(nullptr), + ptr_B(nullptr), + ptr_C(nullptr), + 
ptr_D(nullptr), + ptr_C_split(nullptr), + ptr_D_split(nullptr), + lda(nullptr), + ldb(nullptr), + ldc(nullptr), + ldd(nullptr), + swizzle_log_tile(0), + gemm_k_size(0), + host_problem_sizes(nullptr), + split_k_slices(1), + splitk_buffer_offsets(nullptr) {} + + CUTLASS_HOST_DEVICE + explicit Params(Arguments const& args, void* workspace = nullptr, int tile_count = 0) + : problem_visitor(args.problem_sizes, args.problem_count, workspace, tile_count), + host_problem_sizes(args.host_problem_sizes), + threadblock_count(args.threadblock_count), + output_op(args.output_op), + ptr_A(args.ptr_A), + ptr_B(args.ptr_B), + ptr_C(args.ptr_C), + ptr_D(args.ptr_D), + ptr_C_split(reinterpret_cast<ElementC*>(workspace)), + ptr_D_split(reinterpret_cast<ElementC*>(workspace)), + lda(args.lda), + ldb(args.ldb), + ldc(args.ldc), + ldd(args.ldd), + split_k_slices(args.split_k_slices), + splitk_buffer_offsets(args.splitk_buffer_offsets) { + // Determine grid shape + ThreadblockSwizzle threadblock_swizzle; + grid_tiled_shape = threadblock_swizzle.get_tiled_shape( + args.host_problem_sizes[0], {ThreadblockShape::kM, ThreadblockShape::kN, ThreadblockShape::kK}, + args.split_k_slices); + swizzle_log_tile = ThreadblockSwizzle().get_log_tile(grid_tiled_shape); + + // only support same k + int full_gemm_k_iterations = args.host_problem_sizes[0].k() / Mma::Shape::kK; + int gemm_k_iterations = full_gemm_k_iterations / grid_tiled_shape.k(); + + gemm_k_size = gemm_k_iterations * Mma::Shape::kK; + } + + CUTLASS_HOST_DEVICE + void update(Arguments const& args, void* workspace = nullptr, int tile_count = 0) { + problem_visitor = typename ProblemVisitor::Params(args.problem_sizes, args.problem_count, workspace, tile_count); + threadblock_count = args.threadblock_count; + output_op = args.output_op; + ptr_A = args.ptr_A; + ptr_B = args.ptr_B; + ptr_C = args.ptr_C; + ptr_D = args.ptr_D; + ptr_C_split = reinterpret_cast<ElementC*>(workspace); + ptr_D_split = reinterpret_cast<ElementC*>(workspace); + + lda = args.lda; + ldb = args.ldb; + ldc = args.ldc; + ldd = args.ldd; + } + }; + + /// Shared memory storage structure + struct SharedStorage { + union { + typename Mma::SharedStorage main_loop; + typename Epilogue::SharedStorage epilogue; + } kernel; + + // ProblemVisitor shared storage can't be overlapped with others + typename ProblemVisitor::SharedStorage problem_visitor; + }; + + public: + // + // Methods + // + + CUTLASS_DEVICE + SplitkGemmGrouped() {} + + /// Determines whether kernel satisfies alignment + static Status can_implement(cutlass::gemm::GemmCoord const& problem_size) { return Status::kSuccess; } + + static Status can_implement(Arguments const& args) { return Status::kSuccess; } + + /// Executes one GEMM + CUTLASS_DEVICE + void operator()(Params const& params, SharedStorage& shared_storage) { + // + // These types shadow the type-level definitions and support the ability to implement + // a 'transposed' GEMM that computes the transposed problems. + // + using ElementA = typename Mma::IteratorA::Element; + using LayoutA = typename Mma::IteratorA::Layout; + using ElementB = typename Mma::IteratorB::Element; + using LayoutB = typename Mma::IteratorB::Layout; + using ElementC = typename Epilogue::OutputTileIterator::Element; + using LayoutC = typename Epilogue::OutputTileIterator::Layout; + + // + // Problem visitor.
+ // + ProblemVisitor problem_visitor(params.problem_visitor, shared_storage.problem_visitor, blockIdx.x); + + // Outer 'persistent' loop to iterate over tiles + while (problem_visitor.next_tile()) { + GemmCoord problem_size = problem_visitor.problem_size(); + int32_t problem_idx = problem_visitor.problem_index(); + int32_t threadblock_idx = int32_t(problem_visitor.threadblock_idx()); + + GemmCoord grid_shape = problem_visitor.grid_shape(problem_size); + + // Load element pointers. Exchange pointers and strides if working on the transpose + ElementA* ptr_A = + reinterpret_cast((kTransposed ? params.ptr_B[problem_idx] : params.ptr_A[problem_idx])); + typename LayoutA::LongIndex ldm_A = (kTransposed ? params.ldb[problem_idx] : params.lda[problem_idx]); + + ElementB* ptr_B = + reinterpret_cast((kTransposed ? params.ptr_A[problem_idx] : params.ptr_B[problem_idx])); + typename LayoutB::LongIndex ldm_B = (kTransposed ? params.lda[problem_idx] : params.ldb[problem_idx]); + + // Compute threadblock location + ThreadblockSwizzle threadblock_swizzle; + GemmCoord threadblock_tile_offset = threadblock_swizzle.get_tile_offset(params.swizzle_log_tile); + + cutlass::gemm::GemmCoord threadblock_offset(static_cast(threadblock_idx / grid_shape.n()) * Mma::Shape::kM, + static_cast(threadblock_idx % grid_shape.n()) * Mma::Shape::kN, + 0); + + // Compute initial location in logical coordinates + cutlass::MatrixCoord tb_offset_A{ + threadblock_offset.m(), + threadblock_tile_offset.k() * params.gemm_k_size, + }; + + cutlass::MatrixCoord tb_offset_B{threadblock_tile_offset.k() * params.gemm_k_size, threadblock_offset.n()}; + + // Problem size is a function of threadblock index in the K dimension + int problem_size_k; + if (threadblock_tile_offset.k() + 1 == params.grid_tiled_shape.k()) { + problem_size_k = problem_size.k(); + } else { + problem_size_k = (threadblock_tile_offset.k() + 1) * params.gemm_k_size; + } + + // Compute threadblock-scoped matrix multiply-add + int gemm_k_iterations = (problem_size_k - tb_offset_A.column() + Mma::Shape::kK - 1) / Mma::Shape::kK; + + // Compute position within threadblock + int thread_idx = threadIdx.x; + + // Construct iterators to A and B operands + typename Mma::IteratorA iterator_A(LayoutA(ldm_A), ptr_A, {problem_size.m(), problem_size_k}, thread_idx, + tb_offset_A); + + typename Mma::IteratorB iterator_B(LayoutB(ldm_B), ptr_B, {problem_size_k, problem_size.n()}, thread_idx, + tb_offset_B); + + typename Mma::FragmentC accumulators; + + accumulators.clear(); + + // Broadcast the warp_id computed by lane 0 to ensure dependent code + // is compiled as warp-uniform. + int warp_idx = canonical_warp_idx_sync(); + + int lane_idx = threadIdx.x % 32; + + // + // Matrix multiply phase + // + + // Construct thread-scoped matrix multiply + Mma mma(shared_storage.kernel.main_loop, thread_idx, warp_idx, lane_idx); + + // Wait for all threads to finish their epilogue phases from the previous tile. 
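The problem_size_k clamp above is the core of the split-K scheme: slice k of the grid owns the K columns [k * gemm_k_size, (k + 1) * gemm_k_size), and only the last slice runs to the true end of the problem. The same arithmetic in isolation, as a sketch (split_k_size and check_coverage are illustrative names; tile_k stands in for Mma::Shape::kK); the __syncthreads() that follows in the kernel then enforces the epilogue/mainloop hand-off described in the comment above.

```cpp
#include <cassert>

// Sketch of the split-K partitioning precomputed in Params: every slice owns
// gemm_k_size columns of K, computed from whole K tiles divided evenly.
inline int split_k_size(int problem_k, int tile_k, int split_k_slices) {
  int full_iterations = problem_k / tile_k;                      // whole K tiles
  int iterations_per_slice = full_iterations / split_k_slices;   // tiles per slice
  return iterations_per_slice * tile_k;                          // K columns per slice
}

// The end clamp mirrors the problem_size_k selection in the kernel body:
// every slice but the last stops at (slice + 1) * gemm_k_size.
inline void check_coverage(int problem_k, int tile_k, int split_k_slices) {
  int gemm_k_size = split_k_size(problem_k, tile_k, split_k_slices);
  for (int slice = 0; slice < split_k_slices; ++slice) {
    int begin = slice * gemm_k_size;
    int end = (slice + 1 == split_k_slices) ? problem_k : (slice + 1) * gemm_k_size;
    assert(begin <= end);  // each slice computes a partial GEMM over [begin, end)
  }
}
```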
+ __syncthreads(); + + // Compute threadblock-scoped matrix multiply-add + mma(gemm_k_iterations, accumulators, iterator_A, iterator_B, accumulators); + + // + // Epilogue + // + + EpilogueOutputOp output_op(params.output_op); + + ElementC* ptr_C = params.ptr_C_split; + ElementC* ptr_D = params.ptr_D_split; + + LayoutC layout_C(params.ldc[problem_idx]); + LayoutC layout_D(params.ldd[problem_idx]); + + typename Epilogue::OutputTileIterator::Params params_C(layout_C); + typename Epilogue::OutputTileIterator::Params params_D(layout_D); + + // assume identity swizzle + MatrixCoord threadblock_offset_C(threadblock_offset.m(), threadblock_offset.n()); + + // Tile iterator loading from source tensor. + typename Epilogue::OutputTileIterator iterator_C(params_C, ptr_C, problem_size.mn(), thread_idx, + threadblock_offset_C); + + iterator_C.add_pointer_offset(problem_size.m() * problem_size.n() * threadblock_tile_offset.k() + + gridDim.z * params.splitk_buffer_offsets[problem_idx]); + + // Tile iterator writing to destination tensor. + typename Epilogue::OutputTileIterator iterator_D(params_D, ptr_D, problem_size.mn(), thread_idx, + threadblock_offset_C); + iterator_D.add_pointer_offset(problem_size.m() * problem_size.n() * threadblock_tile_offset.k() + + gridDim.z * params.splitk_buffer_offsets[problem_idx]); + + Epilogue epilogue(shared_storage.kernel.epilogue, thread_idx, warp_idx, lane_idx); + + // Execute the epilogue operator to update the destination tensor. + epilogue(output_op, iterator_D, accumulators, iterator_C); + + // Next tile + problem_visitor.advance(gridDim.x); + } + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace kernel +} // namespace gemm +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/onnxruntime/contrib_ops/cuda/moe/cutlass_extensions/gemm/threadblock/default_dq_mma.h b/onnxruntime/contrib_ops/cuda/moe/cutlass_extensions/gemm/threadblock/default_dq_mma.h new file mode 100644 index 000000000000..8bbc1ee4e6c4 --- /dev/null +++ b/onnxruntime/contrib_ops/cuda/moe/cutlass_extensions/gemm/threadblock/default_dq_mma.h @@ -0,0 +1,120 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include "contrib_ops/cuda/moe/cutlass_extensions/arch/mma.h" +#include "contrib_ops/cuda/moe/cutlass_extensions/interleaved_numeric_conversion.h" + +namespace cutlass { +namespace gemm { +namespace threadblock { +//////////////////////////////////////////////////////////////////////////////// + +// We need to distinguish here, since we want volta support. It is too much effort +// to write shared memory iterators that are probably needed for volta to function +// properly. 
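Picking up the thread of this comment (it concludes just below): the dequantizing conversion itself is nothing more than an array-wise numeric cast on a register fragment, and the design question is only whether it runs on data fresh from global memory (Volta) or fresh from shared memory (Turing+). A rough sketch, with the stock cutlass::NumericArrayConverter standing in for the FastInterleavedAndBiasedNumericArrayConverter this extension defines:

```cpp
#include <cstdint>

#include "cutlass/array.h"
#include "cutlass/cutlass.h"
#include "cutlass/half.h"
#include "cutlass/numeric_conversion.h"

// Sketch: dequantization as an array-wise cast on a fragment. The extension's
// FastInterleavedAndBiasedNumericArrayConverter plays this role with an
// interleaving-aware fast path; the stock converter stands in here.
template <int N>
CUTLASS_HOST_DEVICE cutlass::Array<cutlass::half_t, N> dequantize_fragment(
    cutlass::Array<uint8_t, N> const& quantized) {
  cutlass::NumericArrayConverter<cutlass::half_t, uint8_t, N> converter;
  // On Volta this conversion sits right after the global load (LDG); on
  // Turing+ it sits after the shared-memory load (LDS), just before the MMA.
  return converter(quantized);
}
```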
As a result, we allow converters both after the LDG (for volta) and after +// the LDS for Turing+. +template < + /// Iterator for B matrix in global memory + typename IteratorB, + /// Warp level Mma + typename MmaOperator, + /// Math operation performed by the warp-level operator + typename MathOperator> +struct SetConverters {}; + +// Dequantize after LDG, so set transforms accordingly +template < + /// Iterator for B matrix in global memory + typename IteratorB, + /// Mma Policy + typename MmaOperator> +struct SetConverters { + using TransformAfterLDG = + FastInterleavedAndBiasedNumericArrayConverter; + + using TransformAfterLDS = + NumericArrayConverter; +}; + +// Dequantize after LDS, so set transforms accordingly + +template < + /// Iterator for B matrix in global memory + typename IteratorB, + /// Mma Policy + typename MmaOperator> +struct SetConverters { + using TransformAfterLDG = + NumericArrayConverter; + + using TransformAfterLDS = + FastInterleavedAndBiasedNumericArrayConverter; +}; + +//////////////////////////////////////////////////////////////////////////////// + +template < + /// Element type for A matrix operand + typename ElementA_, + /// Layout type for A matrix operand + typename LayoutA_, + /// Access granularity of A matrix in units of elements + int kAlignmentA, + /// Element type for B matrix operand + typename ElementB_, + /// Layout type for B matrix operand + typename LayoutB_, + /// Access granularity of B matrix in units of elements + int kAlignmentB, + /// Element type for the input scale + typename ElementScale_, + /// Layout for the scale operand + typename LayoutScale_, + /// Access granularity of Scales in unit of elements + int kAlignmentScale, + /// Element type for internal accumulation + typename ElementAccumulator_, + /// Layout type for C and D matrix operands + typename LayoutC_, + /// Operator class tag + typename OperatorClass_, + /// Tag indicating architecture to tune for + typename ArchTag_, + /// Threadblock-level tile size (concept: GemmShape) + typename ThreadblockShape_, + /// Warp-level tile size (concept: GemmShape) + typename WarpShape_, + /// Instruction-level tile size (concept: GemmShape) + typename InstructionShape_, + /// Number of stages used in the pipelined mainloop + int Stages, + /// Operation performed by GEMM + typename Operator_, + /// Use zfill or predicate for out-of-bound cp.async + SharedMemoryClearOption SharedMemoryClear = SharedMemoryClearOption::kNone, + /// + typename Enable = void> +struct DqMma; + +} // namespace threadblock +} // namespace gemm +} // namespace cutlass diff --git a/onnxruntime/contrib_ops/cuda/moe/cutlass_extensions/gemm/threadblock/default_dq_mma_multistage.h b/onnxruntime/contrib_ops/cuda/moe/cutlass_extensions/gemm/threadblock/default_dq_mma_multistage.h new file mode 100644 index 000000000000..8b9d6b0b14ad --- /dev/null +++ b/onnxruntime/contrib_ops/cuda/moe/cutlass_extensions/gemm/threadblock/default_dq_mma_multistage.h @@ -0,0 +1,289 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include + +#include "cutlass/gemm/threadblock/default_mma.h" +#include "contrib_ops/cuda/moe/cutlass_extensions/arch/mma.h" + +#include "contrib_ops/cuda/moe/cutlass_extensions/gemm/threadblock/dq_mma_multistage.h" +#include "contrib_ops/cuda/moe/cutlass_extensions/gemm/warp/default_mma_tensor_op.h" +#include "contrib_ops/cuda/moe/cutlass_extensions/gemm/warp/mma_tensorop_compute_B_with_f16.h" +#include "contrib_ops/cuda/moe/cutlass_extensions/tile_interleaved_layout.h" + +#include "contrib_ops/cuda/moe/cutlass_extensions/gemm/threadblock/default_dq_mma.h" +#include "contrib_ops/cuda/moe/cutlass_extensions/transform/threadblock/fine_grained_scale_zero_iterator.h" + +namespace cutlass { +namespace gemm { +namespace threadblock { + +//////////////////////////////////////////////////////////////////////////////// + +template +struct DefaultScaleIterators; + +// Fine grained iterators +template +struct DefaultScaleIterators> { + using IteratorScale = + cutlass::transform::threadblock::FineGrainedScaleZeroIterator, Element, + Layout, 0, Alignment>; + + using SmemIteratorScale = IteratorScale; +}; + +// Per column iterators +template +struct DefaultScaleIterators> { + // ThreadMap for scale iterator + static_assert((MmaShape::kN % Alignment) == 0, ""); + + private: + using IteratorScaleThreadMap = transform::PitchLinearStripminedThreadMap, + MmaShape::kN / Alignment, Alignment>; + + public: + // Define iterators over tiles from the scale operand + using IteratorScale = + cutlass::transform::threadblock::PredicatedTileIterator, Element, Layout, 0, + IteratorScaleThreadMap, Alignment>; + + using SmemIteratorScale = IteratorScale; +}; + +//////////////////////////////////////////////////////////////////////////////// + +template < + /// Type for elementA + typename ElementA, + /// Layout type for A matrix operand + typename LayoutA, + /// Access granularity of A matrix in units of elements + int kAlignmentA, + /// Type for element B + typename ElementB, + /// Layout type for B matrix operand + typename LayoutB, + /// Access granularity of B matrix in units of elements + int kAlignmentB, + /// Element type for the input scale + typename ElementScale, + /// Layout for the scale operand + typename LayoutScale, + /// Access granularity of Scales in unit of elements + int kAlignmentScale, + /// Element type for internal accumulation + typename ElementAccumulator, + /// Operator class tag + typename OperatorClass, + /// Tag indicating architecture to tune for + typename ArchTag, + /// Threadblock-level tile size (concept: GemmShape) + typename ThreadblockShape, + /// Warp-level tile size (concept: GemmShape) + typename WarpShape, + /// Instruction-level tile size (concept: GemmShape) + typename InstructionShape, + /// Stages in GEMM + int kStages, + /// + typename Operator_, + /// + SharedMemoryClearOption SharedMemoryClear> +struct DqMma= 80 && + !layout::IsColumnMajorTileInterleave::value)>::type> { + static_assert(platform::is_same::value || platform::is_same::value, + "Element A must be fp16 or bf16"); + + using OperatorInfo = arch::DetagOperator; + using 
Operator = typename OperatorInfo::Operator; + static_assert(platform::is_same::value, + "Mma multistage must dequantize after ldsm"); + + static_assert(platform::is_same::value || platform::is_same::value, + "Element B must be uint8 or uint4"); + + static cutlass::arch::CacheOperation::Kind const CacheOpA = ((sizeof_bits::value * kAlignmentA) == 128) + ? cutlass::arch::CacheOperation::Global + : cutlass::arch::CacheOperation::Always; + + static cutlass::arch::CacheOperation::Kind const CacheOpB = ((sizeof_bits::value * kAlignmentB) == 128) + ? cutlass::arch::CacheOperation::Global + : cutlass::arch::CacheOperation::Always; + + // Define the MmaCore components + // Mma core does not depend on stages, so pass in at least 3 here so that the mma multistage pieces are created + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA, ElementB, LayoutB, ElementAccumulator, + layout::RowMajor, OperatorClass, std::max(kStages, 3), Operator, false, CacheOpA, CacheOpB>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using AccessTypeA = cutlass::Array; + using IteratorA = cutlass::transform::threadblock::PredicatedTileAccessIterator< + cutlass::MatrixShape, ElementA, LayoutA, 1, ThreadMapA, AccessTypeA>; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using AccessTypeB = cutlass::Array; + using IteratorB = cutlass::transform::threadblock::PredicatedTileAccessIterator< + cutlass::MatrixShape, ElementB, LayoutB, 0, ThreadMapB, AccessTypeB>; + + using ScaleIterators = + DefaultScaleIterators; + + // Define iterators over tiles from the scale operand + using IteratorScale = typename ScaleIterators::IteratorScale; + + using SmemIteratorScale = typename ScaleIterators::SmemIteratorScale; + + using Converter = FastInterleavedAndBiasedNumericArrayConverter; + + // Define the threadblock-scoped pipelined matrix multiply + using ThreadblockMma = cutlass::gemm::threadblock::DqMmaMultistage< + typename MmaCore::Shape, IteratorA, typename MmaCore::SmemIteratorA, MmaCore::kCacheOpA, IteratorB, + typename MmaCore::SmemIteratorB, MmaCore::kCacheOpB, IteratorScale, SmemIteratorScale, ElementAccumulator, + layout::RowMajor, typename MmaCore::MmaPolicy, kStages, Converter, OperatorInfo::QuantOp, SharedMemoryClear>; +}; + +template < + /// Type for element A + typename ElementA, + /// Layout type for A matrix operand + typename LayoutA, + /// Access granularity of A matrix in units of elements + int kAlignmentA, + /// Type for element B + typename ElementB, + /// Layout type for B matrix operand + typename LayoutB, + /// Access granularity of B matrix in units of elements + int kAlignmentB, + /// Element type for the input scale + typename ElementScale, + /// Layout for the scale operand + typename LayoutScale, + /// Access granularity of Scales in unit of elements + int kAlignmentScale, + /// Element type for internal accumulation + typename ElementAccumulator, + /// Operator class tag + typename OperatorClass, + /// Tag indicating architecture to tune for + typename ArchTag, + /// Threadblock-level tile size (concept: GemmShape) + typename ThreadblockShape, + /// Warp-level tile size (concept: GemmShape) + typename WarpShape, + /// Instruction-level tile size (concept: GemmShape) + typename InstructionShape, + /// Stages in GEMM + int kStages, + /// + typename Operator_, + /// + SharedMemoryClearOption
SharedMemoryClear> +struct DqMma= 80 && + layout::IsColumnMajorTileInterleave::value)>::type> { + static_assert(platform::is_same::value || platform::is_same::value, + "Element A must be fp16 or bf16"); + + using OperatorInfo = arch::DetagOperator; + using Operator = typename OperatorInfo::Operator; + static_assert(platform::is_same::value, + "Mma multistage must dequantize after ldsm"); + + static_assert(platform::is_same::value || platform::is_same::value, + "Element B must be uint8 or uint4"); + + static cutlass::arch::CacheOperation::Kind const CacheOpA = ((sizeof_bits::value * kAlignmentA) == 128) + ? cutlass::arch::CacheOperation::Global + : cutlass::arch::CacheOperation::Always; + + static cutlass::arch::CacheOperation::Kind const CacheOpB = ((sizeof_bits::value * kAlignmentB) == 128) + ? cutlass::arch::CacheOperation::Global + : cutlass::arch::CacheOperation::Always; + + // Define the MmaCore components + // Mma core does not depend on stages, so pass in at least 3 here so that the mma multistage pieces are created + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA, ElementB, layout::ColumnMajor, + ElementAccumulator, layout::RowMajor, OperatorClass, std::max(kStages, 3), Operator, false, CacheOpA, CacheOpB>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using AccessTypeA = cutlass::Array; + using IteratorA = cutlass::transform::threadblock::PredicatedTileAccessIterator< + cutlass::MatrixShape, ElementA, LayoutA, 1, ThreadMapA, AccessTypeA>; + + private: + static constexpr int ColumnsInterleaved = LayoutB::kColumnsInterleaved; + static constexpr int RowsPerTile = LayoutB::kRowsPerTile; + static_assert(!(MmaCore::Shape::kN % ColumnsInterleaved), ""); + static_assert(RowsPerTile == MmaCore::Shape::kK, ""); + + using OriginalThreadMap = typename MmaCore::IteratorThreadMapB; + using OriginalWarpArrangement = typename OriginalThreadMap::Detail::WarpThreadArrangement; + static_assert(!(OriginalWarpArrangement::kStrided % ColumnsInterleaved), ""); + + using GmemIteratorShape = + MatrixShape; + using GmemThreadMapB = transform::PitchLinearWarpRakedThreadMap< + layout::PitchLinearShape, OriginalThreadMap::kThreads, + layout::PitchLinearShape, + MmaCore::kAccessSizeInBits / sizeof_bits::value>; + + public: + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using AccessTypeB = cutlass::Array; + using IteratorB = + cutlass::transform::threadblock::PredicatedTileAccessIterator; + + using ScaleIterators = + DefaultScaleIterators; + + // Define iterators over tiles from the scale operand + using IteratorScale = typename ScaleIterators::IteratorScale; + + using SmemIteratorScale = typename ScaleIterators::SmemIteratorScale; + + using Converter = FastInterleavedAndBiasedNumericArrayConverter; + + // Define the threadblock-scoped pipelined matrix multiply + using ThreadblockMma = cutlass::gemm::threadblock::DqMmaMultistage< + typename MmaCore::Shape, IteratorA, typename MmaCore::SmemIteratorA, MmaCore::kCacheOpA, IteratorB, + typename MmaCore::SmemIteratorB, MmaCore::kCacheOpB, IteratorScale, SmemIteratorScale, ElementAccumulator, + layout::RowMajor, typename MmaCore::MmaPolicy, kStages, Converter, OperatorInfo::QuantOp, SharedMemoryClear>; +}; + +} // namespace threadblock +} // namespace gemm +} // namespace cutlass diff --git
a/onnxruntime/contrib_ops/cuda/moe/cutlass_extensions/gemm/threadblock/default_dq_mma_pipelined.h b/onnxruntime/contrib_ops/cuda/moe/cutlass_extensions/gemm/threadblock/default_dq_mma_pipelined.h new file mode 100644 index 000000000000..91c4cd342569 --- /dev/null +++ b/onnxruntime/contrib_ops/cuda/moe/cutlass_extensions/gemm/threadblock/default_dq_mma_pipelined.h @@ -0,0 +1,245 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include "cutlass/gemm/threadblock/default_mma.h" +#include "contrib_ops/cuda/moe/cutlass_extensions/arch/mma.h" + +#include "contrib_ops/cuda/moe/cutlass_extensions/gemm/threadblock/dq_mma_pipelined.h" +#include "contrib_ops/cuda/moe/cutlass_extensions/gemm/warp/default_mma_tensor_op.h" +#include "contrib_ops/cuda/moe/cutlass_extensions/gemm/warp/mma_tensorop_compute_B_with_f16.h" +#include "contrib_ops/cuda/moe/cutlass_extensions/tile_interleaved_layout.h" + +#include "contrib_ops/cuda/moe/cutlass_extensions/gemm/threadblock/default_dq_mma.h" + +namespace cutlass { +namespace gemm { +namespace threadblock { + +//////////////////////////////////////////////////////////////////////////////// + +template < + /// Type for element A + typename ElementA, + /// Layout type for A matrix operand + typename LayoutA, + /// Access granularity of A matrix in units of elements + int kAlignmentA, + /// Type for element B + typename ElementB, + /// Layout type for B matrix operand + typename LayoutB, + /// Access granularity of B matrix in units of elements + int kAlignmentB, + /// Element type for the input scale + typename ElementScale, + /// Layout for the scale operand + typename LayoutScale, + /// Access granularity of Scales in unit of elements + int kAlignmentScale, + /// Element type for internal accumulation + typename ElementAccumulator, + /// Operator class tag + typename OperatorClass, + /// Tag indicating architecture to tune for + typename ArchTag, + /// Threadblock-level tile size (concept: GemmShape) + typename ThreadblockShape, + /// Warp-level tile size (concept: GemmShape) + typename WarpShape, + /// Instruction-level tile size (concept: GemmShape) + typename InstructionShape, + /// Operation performed by GEMM + typename Operator_> +struct DqMma::value)>::type> { + static_assert(platform::is_same::value || platform::is_same::value, + "Element A must be fp16 or bf16"); + + static_assert(platform::is_same::value || platform::is_same::value, + "Element B must be uint8 or uint4"); + + using OperatorInfo = arch::DetagOperator; + using Operator = typename OperatorInfo::Operator; + static_assert(OperatorInfo::QuantOp == WeightOnlyQuantOp::PER_COLUMN_SCALE_ONLY, ""); + + static constexpr bool DqAfterLDG = platform::is_same::value; + static constexpr bool arch_has_bf16_mma = ArchTag::kMinComputeCapability >= 80; + using MmaCoreElementA = typename platform::conditional::type; + using 
MmaCoreElementB = typename platform::conditional::type; + + // Define the MmaCore components + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, MmaCoreElementA, LayoutA, MmaCoreElementB, LayoutB, + ElementAccumulator, layout::RowMajor, OperatorClass, 2, Operator>; + + // Define iterators over tiles from the A operand + using IteratorA = cutlass::transform::threadblock::PredicatedTileIterator< + cutlass::MatrixShape, ElementA, LayoutA, 1, + typename MmaCore::IteratorThreadMapA, kAlignmentA>; + + // Define iterators over tiles from the B operand + using IteratorB = cutlass::transform::threadblock::PredicatedTileIterator< + cutlass::MatrixShape, ElementB, LayoutB, 0, + typename MmaCore::IteratorThreadMapB, kAlignmentB>; + + // ThreadMap for scale iterator + static_assert((MmaCore::Shape::kN % kAlignmentScale) == 0, ""); + using IteratorScaleThreadMap = + transform::PitchLinearStripminedThreadMap, + MmaCore::Shape::kN / kAlignmentScale, kAlignmentScale>; + + // Define iterators over tiles from the scale operand + using IteratorScale = + cutlass::transform::threadblock::PredicatedTileIterator, ElementScale, + LayoutScale, 0, IteratorScaleThreadMap, kAlignmentScale>; + + using SmemScaleType = typename platform::conditional::type; + using SmemIteratorScale = + cutlass::transform::threadblock::PredicatedTileIterator, + SmemScaleType, LayoutScale, 0, IteratorScaleThreadMap, + kAlignmentScale>; + + using Converters = SetConverters; + + // Define the threadblock-scoped pipelined matrix multiply + using ThreadblockMma = cutlass::gemm::threadblock::DqMmaPipelined< + typename MmaCore::Shape, IteratorA, typename MmaCore::SmemIteratorA, IteratorB, typename MmaCore::SmemIteratorB, + IteratorScale, SmemIteratorScale, ElementAccumulator, layout::RowMajor, typename MmaCore::MmaPolicy, + typename Converters::TransformAfterLDG, typename Converters::TransformAfterLDS, OperatorInfo::QuantOp>; +}; + +// Specialization to handle column major interleave B +template < + /// Type for element A + typename ElementA, + /// Layout type for A matrix operand + typename LayoutA, + /// Access granularity of A matrix in units of elements + int kAlignmentA, + /// Type for element B + typename ElementB, + /// Layout type for B matrix operand + typename LayoutB, + /// Access granularity of B matrix in units of elements + int kAlignmentB, + /// Element type for the input scale + typename ElementScale, + /// Layout for the scale operand + typename LayoutScale, + /// Access granularity of Scales in unit of elements + int kAlignmentScale, + /// Element type for internal accumulation + typename ElementAccumulator, + /// Operator class tag + typename OperatorClass, + /// Tag indicating architecture to tune for + typename ArchTag, + /// Threadblock-level tile size (concept: GemmShape) + typename ThreadblockShape, + /// Warp-level tile size (concept: GemmShape) + typename WarpShape, + /// Instruction-level tile size (concept: GemmShape) + typename InstructionShape, + /// Operation performed by GEMM + typename Operator_> +struct DqMma::value)>::type> { + static_assert(platform::is_same::value || platform::is_same::value, + "Element A must be fp16 or bf16"); + + static_assert(platform::is_same::value || platform::is_same::value, + "Element B must be uint8 or uint4"); + + using OperatorInfo = arch::DetagOperator; + using Operator = typename OperatorInfo::Operator; + static_assert(OperatorInfo::QuantOp == WeightOnlyQuantOp::PER_COLUMN_SCALE_ONLY, ""); + + static constexpr 
bool DqAfterLDG = platform::is_same::value; + static constexpr bool arch_has_bf16_mma = ArchTag::kMinComputeCapability >= 80; + using MmaCoreElementA = typename platform::conditional::type; + using MmaCoreElementB = typename platform::conditional::type; + + // Define the MmaCore components + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, MmaCoreElementA, LayoutA, MmaCoreElementB, layout::ColumnMajor, + ElementAccumulator, layout::RowMajor, OperatorClass, 2, Operator>; + + // Define iterators over tiles from the A operand + using IteratorA = cutlass::transform::threadblock::PredicatedTileIterator< + cutlass::MatrixShape, ElementA, LayoutA, 1, + typename MmaCore::IteratorThreadMapA, kAlignmentA>; + + private: + static constexpr int ColumnsInterleaved = LayoutB::kColumnsInterleaved; + static constexpr int RowsPerTile = LayoutB::kRowsPerTile; + static_assert(!(MmaCore::Shape::kN % ColumnsInterleaved), ""); + static_assert(RowsPerTile == MmaCore::Shape::kK, ""); + + using OriginalThreadMap = typename MmaCore::IteratorThreadMapB; + using OriginalWarpArrangement = typename OriginalThreadMap::Detail::WarpThreadArrangement; + static_assert(!(OriginalWarpArrangement::kStrided % ColumnsInterleaved), ""); + + using GmemIteratorShape = + MatrixShape; + using GmemThreadMapB = transform::PitchLinearWarpRakedThreadMap< + layout::PitchLinearShape, OriginalThreadMap::kThreads, + layout::PitchLinearShape, + MmaCore::kAccessSizeInBits / sizeof_bits::value>; + + public: + // Define iterators over tiles from the B operand + using IteratorB = + cutlass::transform::threadblock::PredicatedTileIterator; + + // ThreadMap for scale iterator + static_assert((MmaCore::Shape::kN % kAlignmentScale) == 0, ""); + using IteratorScaleThreadMap = + transform::PitchLinearStripminedThreadMap, + MmaCore::Shape::kN / kAlignmentScale, kAlignmentScale>; + + // Define iterators over tiles from the scale operand + using IteratorScale = + cutlass::transform::threadblock::PredicatedTileIterator, ElementScale, + LayoutScale, 0, IteratorScaleThreadMap, kAlignmentScale>; + + using SmemScaleType = typename platform::conditional::type; + using SmemIteratorScale = + cutlass::transform::threadblock::PredicatedTileIterator, + SmemScaleType, LayoutScale, 0, IteratorScaleThreadMap, + kAlignmentScale>; + + using Converters = SetConverters; + + // Define the threadblock-scoped pipelined matrix multiply + using ThreadblockMma = cutlass::gemm::threadblock::DqMmaPipelined< + typename MmaCore::Shape, IteratorA, typename MmaCore::SmemIteratorA, IteratorB, typename MmaCore::SmemIteratorB, + IteratorScale, SmemIteratorScale, ElementAccumulator, layout::RowMajor, typename MmaCore::MmaPolicy, + typename Converters::TransformAfterLDG, typename Converters::TransformAfterLDS, OperatorInfo::QuantOp>; +}; + +} // namespace threadblock +} // namespace gemm +} // namespace cutlass diff --git a/onnxruntime/contrib_ops/cuda/moe/cutlass_extensions/gemm/threadblock/default_mma.h b/onnxruntime/contrib_ops/cuda/moe/cutlass_extensions/gemm/threadblock/default_mma.h new file mode 100644 index 000000000000..1a3e7e39c965 --- /dev/null +++ b/onnxruntime/contrib_ops/cuda/moe/cutlass_extensions/gemm/threadblock/default_mma.h @@ -0,0 +1,283 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+ * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include "contrib_ops/cuda/moe/cutlass_extensions/gemm/threadblock/default_dq_mma_multistage.h" +#include "contrib_ops/cuda/moe/cutlass_extensions/gemm/threadblock/default_dq_mma_pipelined.h" +#include "contrib_ops/cuda/moe/cutlass_extensions/gemm/threadblock/default_mma_bf16.h" + +namespace cutlass { +namespace gemm { +namespace threadblock { + +//////////////////////////////////////////////////////////////////////////////// + +/// Specialization for row-major output (OperatorClass TensorOp), fp16 activation & int8 weight +template < + /// Layout type for A matrix operand + typename LayoutA, + /// Access granularity of A matrix in units of elements + int kAlignmentA, + /// Layout type for B matrix operand + typename LayoutB, + /// Access granularity of B matrix in units of elements + int kAlignmentB, + /// Element type for internal accumulation + typename ElementAccumulator, + /// Tag indicating architecture to tune for + typename ArchTag, + /// Threadblock-level tile size (concept: GemmShape) + typename ThreadblockShape, + /// Warp-level tile size (concept: GemmShape) + typename WarpShape, + /// Instruction-level tile size (concept: GemmShape) + typename InstructionShape, + /// Operation performed by GEMM + typename Operator> +struct DefaultMma { + private: + static constexpr int kAlignmentScale = 128 / sizeof_bits::value; + + using Mma = DqMma; + + public: + // Define the MmaCore components + using MmaCore = typename Mma::MmaCore; + + // Define iterators over tiles from the A operand + using IteratorA = typename Mma::IteratorA; + + // Define iterators over tiles from the B operand + using IteratorB = typename Mma::IteratorB; + + // Define the threadblock-scoped pipelined matrix multiply + using ThreadblockMma = typename Mma::ThreadblockMma; +}; + +//////////////////////////////////////////////////////////////////////////////// +/// Specialization for row-major output (OperatorClass TensorOp), fp16 activation & int4 weight +template < + /// Layout type for A matrix operand + typename LayoutA, + /// Access granularity of A matrix in units of elements + int kAlignmentA, + /// Layout type for B matrix operand + typename LayoutB, + /// Access granularity of B matrix in units of elements + int kAlignmentB, + /// Element type for internal accumulation + typename ElementAccumulator, + /// Tag indicating architecture to tune for + typename ArchTag, + /// Threadblock-level tile size (concept: GemmShape) + typename ThreadblockShape, + /// Warp-level tile size (concept: GemmShape) + typename WarpShape, + /// Instruction-level tile size (concept: GemmShape) + typename InstructionShape, + /// Operation performed by GEMM + typename Operator> +struct DefaultMma { + private: + static constexpr int kAlignmentScale = 128 / sizeof_bits::value; + + using Mma = DqMma; + + public: + // Define the MmaCore components + using MmaCore = typename Mma::MmaCore; + + // Define iterators over tiles 
from the A operand + using IteratorA = typename Mma::IteratorA; + + // Define iterators over tiles from the B operand + using IteratorB = typename Mma::IteratorB; + + // Define the threadblock-scoped pipelined matrix multiply + using ThreadblockMma = typename Mma::ThreadblockMma; +}; + +template < + /// Layout type for A matrix operand + typename LayoutA, + /// Access granularity of A matrix in units of elements + int kAlignmentA, + /// Layout type for B matrix operand + typename LayoutB, + /// Access granularity of B matrix in units of elements + int kAlignmentB, + /// Element type for internal accumulation + typename ElementAccumulator, + /// Tag indicating architecture to tune for + typename ArchTag, + /// Threadblock-level tile size (concept: GemmShape) + typename ThreadblockShape, + /// Warp-level tile size (concept: GemmShape) + typename WarpShape, + /// Instruction-level tile size (concept: GemmShape) + typename InstructionShape, + /// Operation performed by GEMM + typename Operator, + /// + int kStages, + /// Shared memory clear option + SharedMemoryClearOption SharedMemoryClear> +struct DefaultMma { + private: + static constexpr int kAlignmentScale = 128 / sizeof_bits::value; + + using Mma = DqMma; + + public: + // Define the MmaCore components + using MmaCore = typename Mma::MmaCore; + + // Define iterators over tiles from the A operand + using IteratorA = typename Mma::IteratorA; + + // Define iterators over tiles from the B operand + using IteratorB = typename Mma::IteratorB; + + // Define the threadblock-scoped pipelined matrix multiply + using ThreadblockMma = typename Mma::ThreadblockMma; +}; + +//////////////////////////////////////////////////////////////////////////////// +/// Specialization for row-major output (OperatorClass TensorOp), fp16 activation & int4 weight +template < + /// Layout type for A matrix operand + typename LayoutA, + /// Access granularity of A matrix in units of elements + int kAlignmentA, + /// Layout type for B matrix operand + typename LayoutB, + /// Access granularity of B matrix in units of elements + int kAlignmentB, + /// Element type for internal accumulation + typename ElementAccumulator, + /// Tag indicating architecture to tune for + typename ArchTag, + /// Threadblock-level tile size (concept: GemmShape) + typename ThreadblockShape, + /// Warp-level tile size (concept: GemmShape) + typename WarpShape, + /// Instruction-level tile size (concept: GemmShape) + typename InstructionShape, + /// Operation performed by GEMM + typename Operator, + /// + int kStages, + /// Shared memory clear option + SharedMemoryClearOption SharedMemoryClear> +struct DefaultMma { + private: + static constexpr int kAlignmentScale = 128 / sizeof_bits::value; + + using Mma = DqMma; + + public: + // Define the MmaCore components + using MmaCore = typename Mma::MmaCore; + + // Define iterators over tiles from the A operand + using IteratorA = typename Mma::IteratorA; + + // Define iterators over tiles from the B operand + using IteratorB = typename Mma::IteratorB; + + // Define the threadblock-scoped pipelined matrix multiply + using ThreadblockMma = typename Mma::ThreadblockMma; +}; + +// fp16 x fp16 specialization on Ampere to use mma multistage for 2 stage. 
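An aside on a constant used by every DefaultMma specialization in this file: kAlignmentScale = 128 / sizeof_bits<Element>::value sizes each access to the scale operand to one 128-bit vector. A compile-time sanity check of that arithmetic, as a sketch:

```cpp
#include "cutlass/bfloat16.h"
#include "cutlass/half.h"
#include "cutlass/numeric_types.h"

// kAlignmentScale counts how many scale elements fill one 128-bit access.
static_assert(128 / cutlass::sizeof_bits<cutlass::half_t>::value == 8,
              "eight fp16 scales per 128-bit access");
static_assert(128 / cutlass::sizeof_bits<cutlass::bfloat16_t>::value == 8,
              "eight bf16 scales per 128-bit access");
static_assert(128 / cutlass::sizeof_bits<float>::value == 4,
              "four fp32 scales per 128-bit access");
```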
Helps avoid reg spills on +// large tile when not enough shared mem is present to do 3+ stage +template < + /// Layout type for A matrix operand + typename LayoutA, + /// Access granularity of A matrix in units of elements + int kAlignmentA, + /// Layout type for B matrix operand + typename LayoutB, + /// Access granularity of B matrix in units of elements + int kAlignmentB, + /// Element type for internal accumulation + typename ElementAccumulator, + /// Threadblock-level tile size (concept: GemmShape) + typename ThreadblockShape, + /// Warp-level tile size (concept: GemmShape) + typename WarpShape, + /// Instruction-level tile size (concept: GemmShape) + typename InstructionShape, + /// Operation performed by GEMM + typename Operator, + /// Use zfill or predicate for out-of-bound cp.async + SharedMemoryClearOption SharedMemoryClear, + /// Gather operand A by using an index array + bool GatherA, + /// Gather operand B by using an index array + bool GatherB> +struct DefaultMma { + // Define the MmaCore components + // 3 is used on purpose here to trigger components for mma multistage + using MmaCore = + typename cutlass::gemm::threadblock::DefaultMmaCore; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using AccessTypeA = cutlass::Array; + using IteratorA = cutlass::transform::threadblock::PredicatedTileAccessIterator< + cutlass::MatrixShape, half_t, LayoutA, 1, ThreadMapA, AccessTypeA, + GatherA>; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using AccessTypeB = cutlass::Array; + using IteratorB = cutlass::transform::threadblock::PredicatedTileAccessIterator< + cutlass::MatrixShape, half_t, LayoutB, 0, ThreadMapB, AccessTypeB, + GatherB>; + + // Define the threadblock-scoped multistage matrix multiply + using ThreadblockMma = + cutlass::gemm::threadblock::MmaMultistage; +}; + +} // namespace threadblock +} // namespace gemm +} // namespace cutlass diff --git a/onnxruntime/contrib_ops/cuda/moe/cutlass_extensions/gemm/threadblock/default_mma_bf16.h b/onnxruntime/contrib_ops/cuda/moe/cutlass_extensions/gemm/threadblock/default_mma_bf16.h new file mode 100644 index 000000000000..4afd482f8562 --- /dev/null +++ b/onnxruntime/contrib_ops/cuda/moe/cutlass_extensions/gemm/threadblock/default_mma_bf16.h @@ -0,0 +1,345 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once + +#include "cutlass/gemm/threadblock/default_mma.h" +#include "contrib_ops/cuda/moe/cutlass_extensions/gemm/threadblock/default_dq_mma_multistage.h" +#include "contrib_ops/cuda/moe/cutlass_extensions/gemm/threadblock/default_dq_mma_pipelined.h" + +namespace cutlass { +namespace gemm { +namespace threadblock { + +//////////////////////////////////////////////////////////////////////////////// + +/// Specialization for row-major output (OperatorClass TensorOp), bf16 activation & bf16 weight +template < + /// Layout type for A matrix operand + typename LayoutA, + /// Access granularity of A matrix in units of elements + int kAlignmentA, + /// Layout type for B matrix operand + typename LayoutB, + /// Access granularity of B matrix in units of elements + int kAlignmentB, + /// Element type for internal accumulation + typename ElementAccumulator, + /// Tag indicating architecture to tune for + typename ArchTag, + /// Threadblock-level tile size (concept: GemmShape) + typename ThreadblockShape, + /// Warp-level tile size (concept: GemmShape) + typename WarpShape, + /// Instruction-level tile size (concept: GemmShape) + typename InstructionShape, + /// Operation performed by GEMM + typename Operator, + /// Use zfill or predicate for out-of-bound cp.async + SharedMemoryClearOption SharedMemoryClear, + /// Gather operand A by using an index array + bool GatherA, + /// Gather operand B by using an index array + bool GatherB> +struct DefaultMma { + private: + // Conversions only needed pre-ampere. This will trigger mma pipeline, so we convert before STS. + static constexpr bool arch_has_bf16_mma = ArchTag::kMinComputeCapability >= 80; + using MmaElementA = typename platform::conditional::type; + using MmaElementB = typename platform::conditional::type; + + public: + // Define the MmaCore components + using MmaCore = + typename cutlass::gemm::threadblock::DefaultMmaCore; + + using IteratorA = cutlass::transform::threadblock::PredicatedTileIterator< + cutlass::MatrixShape, bfloat16_t, LayoutA, 1, + typename MmaCore::IteratorThreadMapA, kAlignmentA, GatherA>; + + // Define iterators over tiles from the B operand + using IteratorB = cutlass::transform::threadblock::PredicatedTileIterator< + cutlass::MatrixShape, bfloat16_t, LayoutB, 0, + typename MmaCore::IteratorThreadMapB, kAlignmentB, GatherB>; + + // Define the threadblock-scoped pipelined matrix multiply + using ThreadblockMma = + cutlass::gemm::threadblock::MmaPipelined; +}; + +// bf16 x bf16 specialization on Ampere to use mma multistage for 2 stage. 
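The pipelined bf16 specialization above relies on a pre-Ampere fallback: architectures without bf16 tensor-core MMAs (compute capability below 80) convert operands to fp16 before the shared-memory store, and the MMA itself runs on half_t. A sketch of that type selection, assuming CUTLASS's platform::conditional and the usual kMinComputeCapability arch tags (MmaElementFor is an illustrative name):

```cpp
#include "cutlass/arch/arch.h"
#include "cutlass/bfloat16.h"
#include "cutlass/half.h"
#include "cutlass/platform/platform.h"

// Sketch: pick the element type the MMA actually runs on. Pre-Ampere parts
// have no bf16 tensor-core MMA, so bf16 data is converted to fp16 before the
// shared-memory store (STS) and the warp-level MMA consumes half_t.
template <typename ArchTag>
struct MmaElementFor {
  static constexpr bool kHasBf16Mma = ArchTag::kMinComputeCapability >= 80;
  using Type = typename cutlass::platform::conditional<kHasBf16Mma, cutlass::bfloat16_t,
                                                       cutlass::half_t>::type;
};

// e.g. MmaElementFor<cutlass::arch::Sm75>::Type is half_t,
//      MmaElementFor<cutlass::arch::Sm80>::Type is bfloat16_t.
```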
Helps avoid reg spills on +// large tile when not enough shared mem is present to do 3+ stage +template < + /// Layout type for A matrix operand + typename LayoutA, + /// Access granularity of A matrix in units of elements + int kAlignmentA, + /// Layout type for B matrix operand + typename LayoutB, + /// Access granularity of B matrix in units of elements + int kAlignmentB, + /// Element type for internal accumulation + typename ElementAccumulator, + /// Threadblock-level tile size (concept: GemmShape) + typename ThreadblockShape, + /// Warp-level tile size (concept: GemmShape) + typename WarpShape, + /// Instruction-level tile size (concept: GemmShape) + typename InstructionShape, + /// Operation performed by GEMM + typename Operator, + /// Use zfill or predicate for out-of-bound cp.async + SharedMemoryClearOption SharedMemoryClear, + /// Gather operand A by using an index array + bool GatherA, + /// Gather operand B by using an index array + bool GatherB> +struct DefaultMma { + // Define the MmaCore components + // 3 is used on purpose here to trigger components for mma multistage + using MmaCore = + typename cutlass::gemm::threadblock::DefaultMmaCore; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using AccessTypeA = cutlass::Array; + using IteratorA = cutlass::transform::threadblock::PredicatedTileAccessIterator< + cutlass::MatrixShape, bfloat16_t, LayoutA, 1, ThreadMapA, AccessTypeA, + GatherA>; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using AccessTypeB = cutlass::Array; + using IteratorB = cutlass::transform::threadblock::PredicatedTileAccessIterator< + cutlass::MatrixShape, bfloat16_t, LayoutB, 0, ThreadMapB, AccessTypeB, + GatherB>; + + // Define the threadblock-scoped multistage matrix multiply + using ThreadblockMma = + cutlass::gemm::threadblock::MmaMultistage; +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// Specialization for row-major output (OperatorClass TensorOp), bf16 activation & int8 weight +template < + /// Layout type for A matrix operand + typename LayoutA, + /// Access granularity of A matrix in units of elements + int kAlignmentA, + /// Layout type for B matrix operand + typename LayoutB, + /// Access granularity of B matrix in units of elements + int kAlignmentB, + /// Element type for internal accumulation + typename ElementAccumulator, + /// Tag indicating architecture to tune for + typename ArchTag, + /// Threadblock-level tile size (concept: GemmShape) + typename ThreadblockShape, + /// Warp-level tile size (concept: GemmShape) + typename WarpShape, + /// Instruction-level tile size (concept: GemmShape) + typename InstructionShape, + /// Operation performed by GEMM + typename Operator> +struct DefaultMma { + private: + static constexpr int kAlignmentScale = 128 / sizeof_bits::value; + + using Mma = DqMma; + + public: + // Define the MmaCore components + using MmaCore = typename Mma::MmaCore; + + // Define iterators over tiles from the A operand + using IteratorA = typename Mma::IteratorA; + + // Define iterators over tiles from the B operand + using IteratorB = typename Mma::IteratorB; + + // Define the threadblock-scoped pipelined matrix multiply + using ThreadblockMma = typename Mma::ThreadblockMma; +}; + +//////////////////////////////////////////////////////////////////////////////// +/// Specialization for row-major output (OperatorClass TensorOp), bf16 activation & 
int4 weight +template < + /// Layout type for A matrix operand + typename LayoutA, + /// Access granularity of A matrix in units of elements + int kAlignmentA, + /// Layout type for B matrix operand + typename LayoutB, + /// Access granularity of B matrix in units of elements + int kAlignmentB, + /// Element type for internal accumulation + typename ElementAccumulator, + /// Tag indicating architecture to tune for + typename ArchTag, + /// Threadblock-level tile size (concept: GemmShape) + typename ThreadblockShape, + /// Warp-level tile size (concept: GemmShape) + typename WarpShape, + /// Instruction-level tile size (concept: GemmShape) + typename InstructionShape, + /// Operation performed by GEMM + typename Operator> +struct DefaultMma { + private: + static constexpr int kAlignmentScale = 128 / sizeof_bits::value; + + using Mma = DqMma; + + public: + // Define the MmaCore components + using MmaCore = typename Mma::MmaCore; + + // Define iterators over tiles from the A operand + using IteratorA = typename Mma::IteratorA; + + // Define iterators over tiles from the B operand + using IteratorB = typename Mma::IteratorB; + + // Define the threadblock-scoped pipelined matrix multiply + using ThreadblockMma = typename Mma::ThreadblockMma; +}; + +template < + /// Layout type for A matrix operand + typename LayoutA, + /// Access granularity of A matrix in units of elements + int kAlignmentA, + /// Layout type for B matrix operand + typename LayoutB, + /// Access granularity of B matrix in units of elements + int kAlignmentB, + /// Element type for internal accumulation + typename ElementAccumulator, + /// Tag indicating architecture to tune for + typename ArchTag, + /// Threadblock-level tile size (concept: GemmShape) + typename ThreadblockShape, + /// Warp-level tile size (concept: GemmShape) + typename WarpShape, + /// Instruction-level tile size (concept: GemmShape) + typename InstructionShape, + /// Operation performed by GEMM + typename Operator, + /// + int kStages, + /// Shared memory clear option + SharedMemoryClearOption SharedMemoryClear> +struct DefaultMma { + private: + static constexpr int kAlignmentScale = 128 / sizeof_bits::value; + + using Mma = DqMma; + + public: + // Define the MmaCore components + using MmaCore = typename Mma::MmaCore; + + // Define iterators over tiles from the A operand + using IteratorA = typename Mma::IteratorA; + + // Define iterators over tiles from the B operand + using IteratorB = typename Mma::IteratorB; + + // Define the threadblock-scoped pipelined matrix multiply + using ThreadblockMma = typename Mma::ThreadblockMma; +}; + +//////////////////////////////////////////////////////////////////////////////// +/// Specialization for row-major output (OperatorClass TensorOp), bf16 activation & int4 weight +template < + /// Layout type for A matrix operand + typename LayoutA, + /// Access granularity of A matrix in units of elements + int kAlignmentA, + /// Layout type for B matrix operand + typename LayoutB, + /// Access granularity of B matrix in units of elements + int kAlignmentB, + /// Element type for internal accumulation + typename ElementAccumulator, + /// Tag indicating architecture to tune for + typename ArchTag, + /// Threadblock-level tile size (concept: GemmShape) + typename ThreadblockShape, + /// Warp-level tile size (concept: GemmShape) + typename WarpShape, + /// Instruction-level tile size (concept: GemmShape) + typename InstructionShape, + /// Operation performed by GEMM + typename Operator, + /// + int kStages, + /// Shared memory
clear option + SharedMemoryClearOption SharedMemoryClear> +struct DefaultMma { + private: + static constexpr int kAlignmentScale = 128 / sizeof_bits::value; + + using Mma = DqMma; + + public: + // Define the MmaCore components + using MmaCore = typename Mma::MmaCore; + + // Define iterators over tiles from the A operand + using IteratorA = typename Mma::IteratorA; + + // Define iterators over tiles from the B operand + using IteratorB = typename Mma::IteratorB; + + // Define the threadblock-scoped pipelined matrix multiply + using ThreadblockMma = typename Mma::ThreadblockMma; +}; + +} // namespace threadblock +} // namespace gemm +} // namespace cutlass diff --git a/onnxruntime/contrib_ops/cuda/moe/cutlass_extensions/gemm/threadblock/dq_mma_base.h b/onnxruntime/contrib_ops/cuda/moe/cutlass_extensions/gemm/threadblock/dq_mma_base.h new file mode 100644 index 000000000000..cf5ba6faa0c8 --- /dev/null +++ b/onnxruntime/contrib_ops/cuda/moe/cutlass_extensions/gemm/threadblock/dq_mma_base.h @@ -0,0 +1,237 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Template for a double-buffered threadblock-scoped GEMM kernel. 
+*/ + +#pragma once + +#include "cutlass/aligned_buffer.h" +#include "cutlass/arch/memory.h" +#include "cutlass/array.h" +#include "cutlass/cutlass.h" +#include "cutlass/gemm/gemm.h" +#include "cutlass/gemm/threadblock/mma_base.h" +#include "cutlass/matrix_shape.h" +#include "cutlass/numeric_types.h" + +#include "contrib_ops/cuda/moe/cutlass_extensions/weight_only_quant_op.h" + +//////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace gemm { +namespace threadblock { + +//////////////////////////////////////////////////////////////////////////////// +// SFINAE trick so I can keep the same loop code for Volta and dispatch to the +// correct warp level mma. On volta, all data is stored to shared memory as FP16. +template +CUTLASS_DEVICE void run_warp_mma(WarpMma& warp_mma, typename WarpMma::FragmentC& D, + typename WarpMma::FragmentA const& A, typename WarpMma::FragmentB const& B, + typename WarpMma::FragmentC const& C, int const warp_tileB_k_offset) { + warp_mma(D, A, B, C); +} + +template +CUTLASS_DEVICE void run_warp_mma(WarpMma& warp_mma, typename WarpMma::FragmentC& D, + typename WarpMma::TransformedFragmentA const& A, + typename WarpMma::TransformedFragmentB const& B, typename WarpMma::FragmentC const& C, + int const warp_tileB_k_offset) { + warp_mma(D, A, B, C, warp_tileB_k_offset); +} + +//////////////////////////////////////////////////////////////////////////////// + +/// Structure to compute the matrix product targeting CUDA cores and SIMT math +/// instructions. +template < + /// Size of the Gemm problem - concept: gemm::GemmShape<> + typename Shape_, + /// Policy describing tuning details (concept: MmaPolicy) + typename Policy_, + /// The type of the scales + typename ElementScale_, + /// Number of stages, + int Stages, + /// The dequantizing op to be performed. + WeightOnlyQuantOp DequantOp, + /// Used for partial specialization, + typename Enable = bool> +class DqMmaBase { + public: + ///< Size of the Gemm problem - concept: gemm::GemmShape<> + using Shape = Shape_; + + ///< Policy describing tuning details + using Policy = Policy_; + + ///< Type of the scale to be loaded + using ElementScale = ElementScale_; + + static_assert(DequantOp != WeightOnlyQuantOp::UNDEFINED, ""); + + // Finegrained scales get streamed in via cp.async + static constexpr int ScalebiasStages = isFinegrained(DequantOp) ? Stages : 1; + // We always have scales. + static constexpr int ScaleElementsPerStage = Shape::kN; + // We sometimes have a bias + static constexpr int BiasElementsPerStage = hasZero(DequantOp) ? Shape::kN : 0; + + // + // Dependent types + // + + /// Warp-level Mma + using Operator = typename Policy::Operator; + + /// Shape describing the overall GEMM computed from shared memory + /// by each warp. 
+ using WarpGemm = typename Policy::Operator::Shape; + + /// Shape describing the number of warps filling the CTA + using WarpCount = GemmShape; + + /// Number of warp-level GEMM operations + static int const kWarpGemmIterations = (WarpGemm::kK / Operator::Policy::MmaShape::kK); + + static constexpr int kNumKIterationsPerWarpBLoad = + Operator::IteratorB::InstructionShape::kRow / Operator::InstructionShape::kK; + + static_assert(!(kWarpGemmIterations % kNumKIterationsPerWarpBLoad), ""); + static constexpr int kWarpGemmIterationsForB = kWarpGemmIterations / kNumKIterationsPerWarpBLoad; + + /// Number of stages + static int const kStages = Stages; + + /// Tensor reference to the A operand + using TensorRefA = TensorRef; + + /// Tensor reference to the B operand + using TensorRefB = TensorRef; + + // + // Nested structs + // + + /// Shared storage object needed by threadblock-scoped GEMM + class SharedStorage { + public: + // + // Type definitions + // + + /// Shape of the A matrix operand in shared memory + using ShapeA = + MatrixShape; + + /// Shape of the B matrix operand in shared memory + using ShapeB = + MatrixShape; + + /// Shape of the shared memory buffer for the scales for the B matrix. + using ShapeScale = MatrixShape; + /// Shape of the shared memory buffer for the biases of the B matrix. + using ShapeZero = MatrixShape; + + public: + // + // Data members + // + + /// Buffer for A operand + AlignedBuffer operand_A; + + /// Buffer for B operand + AlignedBuffer operand_B; + + /// Buffer to hold scales for threadblock + AlignedBuffer operand_scale; + + /// Buffer to hold zero-points for threadblock + AlignedBuffer operand_zero; + + public: + // + // Methods + // + + /// Returns a layout object for the A matrix + CUTLASS_DEVICE + static typename Operator::LayoutA LayoutA() { return Operator::LayoutA::packed({ShapeA::kRow, ShapeA::kColumn}); } + + /// Returns a layout object for the B matrix + CUTLASS_HOST_DEVICE + static typename Operator::LayoutB LayoutB() { return Operator::LayoutB::packed({ShapeB::kRow, ShapeB::kColumn}); } + + /// Returns a TensorRef to the A operand + CUTLASS_HOST_DEVICE + TensorRefA operand_A_ref() { return TensorRefA{operand_A.data(), LayoutA()}; } + + /// Returns a TensorRef to the B operand + CUTLASS_HOST_DEVICE + TensorRefB operand_B_ref() { return TensorRefB{operand_B.data(), LayoutB()}; } + }; + + protected: + // + // Data members + // + + /// Iterator to load a warp-scoped tile of A operand from shared memory + typename Operator::IteratorA warp_tile_iterator_A_; + + /// Iterator to load a warp-scoped tile of B operand from shared memory + typename Operator::IteratorB warp_tile_iterator_B_; + + public: + /// Construct from tensor references + CUTLASS_DEVICE + DqMmaBase( + ///< Shared storage needed for internal use by threadblock-scoped GEMM + SharedStorage& shared_storage, + ///< ID within the threadblock + int thread_idx, + ///< ID of warp + int warp_idx, + ///< ID of each thread within a warp + int lane_idx) + : warp_tile_iterator_A_(shared_storage.operand_A_ref(), lane_idx), + warp_tile_iterator_B_(shared_storage.operand_B_ref(), lane_idx) {} +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace gemm +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/onnxruntime/contrib_ops/cuda/moe/cutlass_extensions/gemm/threadblock/dq_mma_multistage.h
b/onnxruntime/contrib_ops/cuda/moe/cutlass_extensions/gemm/threadblock/dq_mma_multistage.h new file mode 100644 index 000000000000..f11e94d9d2b9 --- /dev/null +++ b/onnxruntime/contrib_ops/cuda/moe/cutlass_extensions/gemm/threadblock/dq_mma_multistage.h @@ -0,0 +1,107 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Template for a double-buffered threadblock-scoped GEMM kernel. +*/ + +#pragma once + +#include "cutlass/aligned_buffer.h" +#include "cutlass/arch/memory.h" +#include "cutlass/array.h" +#include "cutlass/cutlass.h" +#include "cutlass/gemm/gemm.h" +#include "cutlass/matrix_shape.h" +#include "cutlass/numeric_types.h" + +#include "contrib_ops/cuda/moe/cutlass_extensions/gemm/threadblock/dq_mma_base.h" +#include "contrib_ops/cuda/moe/cutlass_extensions/gemm/warp/mma_tensorop_dequantizer.h" +#include "contrib_ops/cuda/moe/cutlass_extensions/interleaved_numeric_conversion.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace gemm { +namespace threadblock { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Structure to compute the matrix product targeting CUDA cores and SIMT math +/// instructions. 
+template <
+    /// Size of the Gemm problem - concept: gemm::GemmShape<>
+    typename Shape_,
+    /// Iterates over tiles of A operand in global memory
+    //  (concept: ReadableTileIterator | ForwardTileIterator |
+    //  MaskedTileIterator)
+    typename IteratorA_,
+    /// Iterates over tiles of A operand in shared memory
+    /// (concept: WriteableTileIterator | RandomAccessTileIterator)
+    typename SmemIteratorA_,
+    /// Cache operation for operand A
+    cutlass::arch::CacheOperation::Kind CacheOpA,
+    /// Iterates over tiles of B operand in global memory
+    //  (concept: ReadableTileIterator | ForwardTileIterator |
+    //  MaskedTileIterator)
+    typename IteratorB_,
+    /// Iterates over tiles of B operand in shared memory
+    /// (concept: WriteableTileIterator | RandomAccessTileIterator)
+    typename SmemIteratorB_,
+    /// Cache operation for operand B
+    cutlass::arch::CacheOperation::Kind CacheOpB,
+    /// Iterates over tiles of the scale operand in global memory
+    typename IteratorScale_,
+    /// Iterates over tiles of the scale operand in shared memory
+    typename SmemIteratorScale_,
+    /// Data type of accumulator matrix
+    typename ElementC_,
+    /// Layout of accumulator matrix
+    typename LayoutC_,
+    /// Policy describing tuning details (concept: MmaPolicy)
+    typename Policy_,
+    /// Number of stages
+    int Stages,
+    /// Converter for B matrix applied immediately after the LDS
+    typename TransformBAfterLDS_,
+    /// The quantization operator being used
+    WeightOnlyQuantOp QuantOp_,
+    /// Use zfill or predicate for out-of-bound cp.async
+    SharedMemoryClearOption SharedMemoryClear = SharedMemoryClearOption::kNone,
+    /// Used for partial specialization
+    typename Enable = void>
+class DqMmaMultistage;
+
+}  // namespace threadblock
+}  // namespace gemm
+}  // namespace cutlass
+
+#include "contrib_ops/cuda/moe/cutlass_extensions/gemm/threadblock/dq_mma_multistage_finegrained.h"
+#include "contrib_ops/cuda/moe/cutlass_extensions/gemm/threadblock/dq_mma_multistage_percol.h"
diff --git a/onnxruntime/contrib_ops/cuda/moe/cutlass_extensions/gemm/threadblock/dq_mma_multistage_finegrained.h b/onnxruntime/contrib_ops/cuda/moe/cutlass_extensions/gemm/threadblock/dq_mma_multistage_finegrained.h
new file mode 100644
index 000000000000..dd934b9a0036
--- /dev/null
+++ b/onnxruntime/contrib_ops/cuda/moe/cutlass_extensions/gemm/threadblock/dq_mma_multistage_finegrained.h
@@ -0,0 +1,634 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Template for a double-buffered threadblock-scoped GEMM kernel. +*/ + +#pragma once + +#include "cutlass/aligned_buffer.h" +#include "cutlass/arch/memory.h" +#include "cutlass/array.h" +#include "cutlass/cutlass.h" +#include "cutlass/gemm/gemm.h" +#include "cutlass/matrix_shape.h" +#include "cutlass/numeric_types.h" + +#include "contrib_ops/cuda/moe/cutlass_extensions/gemm/threadblock/dq_mma_base.h" +#include "contrib_ops/cuda/moe/cutlass_extensions/gemm/warp/mma_tensorop_dequantizer.h" +#include "contrib_ops/cuda/moe/cutlass_extensions/interleaved_numeric_conversion.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace gemm { +namespace threadblock { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Structure to compute the matrix product targeting CUDA cores and SIMT math +/// instructions. +template < + /// Size of the Gemm problem - concept: gemm::GemmShape<> + typename Shape_, + /// Iterates over tiles of A operand in global memory + // (concept: ReadableTileIterator | ForwardTileIterator | + // MaskedTileIterator) + typename IteratorA_, + /// Iterates over tiles of A operand in shared memory + /// (concept: WriteableTileIterator | RandomAccessTileIterator) + typename SmemIteratorA_, + /// Cache operation for operand A + cutlass::arch::CacheOperation::Kind CacheOpA, + /// Iterates over tiles of B operand in global memory + // (concept: ReadableTileIterator | ForwardTileIterator | + // MaskedTileIterator) + typename IteratorB_, + /// Iterates over tiles of B operand in shared memory + /// (concept: WriteableTileIterator | RandomAccessTileIterator) + typename SmemIteratorB_, + /// Cache operation for operand B + cutlass::arch::CacheOperation::Kind CacheOpB, + /// Data type for the scales + typename IteratorScale_, + /// Iterators over scales in shared memory + typename SmemIteratorScale_, + /// Data type of accumulator matrix + typename ElementC_, + /// Data type of accumulator matrix + typename LayoutC_, + /// Policy describing tuning details (concept: MmaPolicy) + typename Policy_, + /// Number of stages, + int Stages, + /// Converter for B matrix applited immediately after the LDS + typename TransformBAfterLDS_, + /// The quantization operator being used + WeightOnlyQuantOp QuantOp_, + /// Use zfill or predicate for out-of-bound cp.async + SharedMemoryClearOption SharedMemoryClear> +class DqMmaMultistage> + : public DqMmaBase { + public: + ///< Base class + using Base = DqMmaBase; + ///< Size of the Gemm problem - concept: gemm::GemmShape<> + using Shape = Shape_; + ///< Iterates over tiles of A operand in global memory + using IteratorA = IteratorA_; + ///< Iterates over tiles of B operand in global memory + using IteratorB = IteratorB_; + ///< Data type of accumulator matrix + using ElementC 
= ElementC_; + ///< Layout of accumulator matrix + using LayoutC = LayoutC_; + ///< Policy describing tuning details + using Policy = Policy_; + + using IteratorScale = IteratorScale_; + using ElementScale = typename IteratorScale::Element; + using LayoutScale = typename IteratorScale::Layout; + + using SmemIteratorA = SmemIteratorA_; + using SmemIteratorB = SmemIteratorB_; + using SmemIteratorScale = SmemIteratorScale_; + + static cutlass::arch::CacheOperation::Kind const kCacheOpA = CacheOpA; + static cutlass::arch::CacheOperation::Kind const kCacheOpB = CacheOpB; + + using TransformBAfterLDS = TransformBAfterLDS_; + + static constexpr WeightOnlyQuantOp QuantOp = QuantOp_; + // + // Dependent types + // + + /// Fragment of accumulator tile + using FragmentC = typename Policy::Operator::FragmentC; + + /// Warp-level Mma + using Operator = typename Policy::Operator; + + /// Minimum architecture is Sm80 to support cp.async + using ArchTag = arch::Sm80; + + using Dequantizer = warp::MmaTensorOpDequantizer; + + /// Complex transform on A operand + static ComplexTransform const kTransformA = Operator::kTransformA; + + /// Complex transform on B operand + static ComplexTransform const kTransformB = Operator::kTransformB; + + static_assert(Base::SharedStorage::ShapeScale::kRow == Stages, ""); + static_assert(Base::SharedStorage::ShapeScale::kColumn == Shape::kN, ""); + + /// Internal structure exposed for introspection. + struct Detail { + static_assert(Base::kWarpGemmIterations > 1, + "The pipelined structure requires at least two warp-level " + "GEMM operations."); + + /// Number of cp.async instructions to load one stage of operand A + static int const AsyncCopyIterationsPerStageA = IteratorA::ThreadMap::Iterations::kCount; + + /// Number of cp.async instructions to load one stage of operand B + static int const AsyncCopyIterationsPerStageB = IteratorB::ThreadMap::Iterations::kCount; + + /// Number of stages + static int const kStages = Stages; + + /// Number of cp.async instructions to load on group of operand A + static int const kAccessesPerGroupA = + (AsyncCopyIterationsPerStageA + Base::kWarpGemmIterations - 1) / Base::kWarpGemmIterations; + + /// Number of cp.async instructions to load on group of operand B + static int const kAccessesPerGroupB = + (AsyncCopyIterationsPerStageB + Base::kWarpGemmIterations - 1) / Base::kWarpGemmIterations; + }; + + private: + using WarpFragmentA = typename Operator::FragmentA; + using WarpFragmentB = typename Operator::FragmentB; + Dequantizer warp_dequantizer_; + + using ElementB = typename IteratorB::Element; + using LayoutDetailsForB = kernel::LayoutDetailsB; + + static constexpr bool RequiresTileInterleave = + layout::IsColumnMajorTileInterleave::value; + static_assert(!RequiresTileInterleave || (RequiresTileInterleave && (Shape::kK == LayoutDetailsForB::ThreadblockK)), + "Layout K must match threadblockK"); + + private: + // + // Data members + // + + /// Iterator to write threadblock-scoped tile of A operand to shared memory + SmemIteratorA smem_iterator_A_; + + /// Iterator to write threadblock-scoped tile of B operand to shared memory + SmemIteratorB smem_iterator_B_; + + /// Iterator to write threadblock-scoped tile of scale and zero operand to shared memory + SmemIteratorScale smem_iterator_scale_; + + public: + /// Construct from tensor references + CUTLASS_DEVICE + DqMmaMultistage( + ///< Shared storage needed for internal use by threadblock-scoped GEMM + typename Base::SharedStorage& shared_storage, + /// The group size for quantization + 
int group_size, + ///< ID within the threadblock + int thread_idx, + ///< ID of warp + int warp_idx, + ///< ID of each thread within a warp + int lane_idx) + : Base(shared_storage, thread_idx, warp_idx, lane_idx), + warp_dequantizer_({shared_storage.operand_scale.data(), LayoutScale(Shape::kN)}, + {shared_storage.operand_zero.data(), LayoutScale(Shape::kN)}, + (warp_idx % (Base::WarpCount::kM * Base::WarpCount::kN)) / Base::WarpCount::kM, lane_idx), + smem_iterator_A_(shared_storage.operand_A_ref(), thread_idx), + smem_iterator_B_(shared_storage.operand_B_ref(), thread_idx), + smem_iterator_scale_(LayoutScale(Shape::kN), shared_storage.operand_scale.data(), + shared_storage.operand_zero.data(), {Base::kStages, Shape::kN}, thread_idx, group_size) { + // Compute warp location within threadblock tile by mapping the warp_id to + // three coordinates: + // _m: the warp's position within the threadblock along the M dimension + // _n: the warp's position within the threadblock along the N dimension + // _k: the warp's position within the threadblock along the K dimension + + int warp_idx_mn = warp_idx % (Base::WarpCount::kM * Base::WarpCount::kN); + int warp_idx_k = warp_idx / (Base::WarpCount::kM * Base::WarpCount::kN); + + int warp_idx_m = warp_idx_mn % Base::WarpCount::kM; + int warp_idx_n = warp_idx_mn / Base::WarpCount::kM; + + // Add per-warp offsets in units of warp-level tiles + this->warp_tile_iterator_A_.add_tile_offset({warp_idx_m, Base::kWarpGemmIterations * warp_idx_k}); + this->warp_tile_iterator_B_.add_tile_offset({Base::kWarpGemmIterationsForB * warp_idx_k, warp_idx_n}); + } + + CUTLASS_DEVICE + void copy_scales_and_advance(IteratorScale& iterator_scale, int stage = -1, int k_iter = -1) { + static_assert(IteratorScale::Shape::kRow == 1, "Scale stride must be 1."); + + typename IteratorScale::AccessType* gmem_scale_ptr = iterator_scale.get_scale(); + typename IteratorScale::AccessType* gmem_zero_ptr = iterator_scale.get_zero(); + + typename IteratorScale::AccessType* smem_scale_ptr = + reinterpret_cast(this->smem_iterator_scale_.get_scale()); + typename IteratorScale::AccessType* smem_zero_ptr = + reinterpret_cast(this->smem_iterator_scale_.get_zero()); + + int const kSrcBytes = sizeof_bits::value * IteratorScale::kAlignment / 8; + + cutlass::arch::cp_async(smem_scale_ptr, gmem_scale_ptr, iterator_scale.valid()); + + if (gmem_zero_ptr != nullptr) { + cutlass::arch::cp_async(smem_zero_ptr, gmem_zero_ptr, iterator_scale.valid()); + } + + if (iterator_scale.group_size_ == 64) { + iterator_scale.add_tile_offset({1, 0}); + } else if (iterator_scale.group_size_ == 128) { + if (iterator_scale.row_groupsize64_ & 0x1) { + iterator_scale.add_tile_offset({1, 0}); + } + } + + iterator_scale.row_groupsize64_++; + + this->smem_iterator_scale_.add_tile_offset({1, 0}); + } + + CUTLASS_DEVICE + void copy_tiles_and_advance(IteratorA& iterator_A, IteratorB& iterator_B, IteratorScale& iterator_scale, + int group_start_A = 0, int group_start_B = 0) { + iterator_A.set_iteration_index(group_start_A * IteratorA::kAccessesPerVector); + this->smem_iterator_A_.set_iteration_index(group_start_A); + + // Async Copy for operand A + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < Detail::kAccessesPerGroupA; ++j) { + if (group_start_A + j < Detail::AsyncCopyIterationsPerStageA) { + typename IteratorA::AccessType* dst_ptr = + reinterpret_cast(this->smem_iterator_A_.get()); + + int const kSrcBytes = sizeof_bits::value * + IteratorA::ThreadMap::kElementsPerAccess / IteratorA::kAccessesPerVector / 8; + + 
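+        // As a worked example of the byte count above (assuming a hypothetical
+        // iterator configuration, not one fixed by this header): with 16-bit
+        // elements, 8 elements per access, and a single access per vector, each
+        // cp.async moves 16 * 8 / 1 / 8 = 16 bytes, one of the 4/8/16-byte
+        // transfer sizes the cp.async instruction supports.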
CUTLASS_PRAGMA_UNROLL + for (int v = 0; v < IteratorA::kAccessesPerVector; ++v) { + auto gmem_ptr = iterator_A.get(); + + if (SharedMemoryClear == SharedMemoryClearOption::kZfill) { + cutlass::arch::cp_async_zfill(dst_ptr + v, gmem_ptr, iterator_A.valid()); + } else { + cutlass::arch::cp_async(dst_ptr + v, gmem_ptr, iterator_A.valid()); + } + + ++iterator_A; + } + + ++this->smem_iterator_A_; + } + } + + iterator_B.set_iteration_index(group_start_B * IteratorB::kAccessesPerVector); + this->smem_iterator_B_.set_iteration_index(group_start_B); + + // Async Copy for operand B + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < Detail::kAccessesPerGroupB; ++j) { + if (group_start_B + j < Detail::AsyncCopyIterationsPerStageB) { + typename IteratorB::AccessType* dst_ptr = + reinterpret_cast(this->smem_iterator_B_.get()); + + int const kSrcBytes = sizeof_bits::value * + IteratorB::ThreadMap::kElementsPerAccess / IteratorB::kAccessesPerVector / 8; + + CUTLASS_PRAGMA_UNROLL + for (int v = 0; v < IteratorB::kAccessesPerVector; ++v) { + auto gmem_ptr = iterator_B.get(); + + if (SharedMemoryClear == SharedMemoryClearOption::kZfill) { + cutlass::arch::cp_async_zfill(dst_ptr + v, gmem_ptr, iterator_B.valid()); + } else { + cutlass::arch::cp_async(dst_ptr + v, gmem_ptr, iterator_B.valid()); + } + + ++iterator_B; + } + ++this->smem_iterator_B_; + } + } + } + + /// Perform a threadblock-scoped matrix multiply-accumulate + CUTLASS_DEVICE + void operator()( + ///< problem size of GEMM + int gemm_k_iterations, + ///< destination accumulator tile + FragmentC& accum, + ///< iterator over A operand in global memory + IteratorA iterator_A, + ///< iterator over B operand in global memory + IteratorB iterator_B, + ///< iterator over scale operand in global memory + IteratorScale iterator_scale, + ///< initial value of accumulator + FragmentC const& src_accum) { + // + // Prologue + // + + TransformBAfterLDS lds_converter; + + // Issue several complete stages + CUTLASS_PRAGMA_UNROLL + for (int stage = 0; stage < Base::kStages - 1; ++stage, --gemm_k_iterations) { + iterator_A.clear_mask(gemm_k_iterations == 0); + iterator_B.clear_mask(gemm_k_iterations == 0); + iterator_scale.clear_mask(gemm_k_iterations == 0); + + iterator_A.set_iteration_index(0); + this->smem_iterator_A_.set_iteration_index(0); + + // Async Copy for operand A + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < Detail::AsyncCopyIterationsPerStageA; ++j) { + typename IteratorA::AccessType* dst_ptr = + reinterpret_cast(this->smem_iterator_A_.get()); + + CUTLASS_PRAGMA_UNROLL + for (int v = 0; v < IteratorA::kAccessesPerVector; ++v) { + int const kSrcBytes = sizeof_bits::value * + IteratorA::ThreadMap::kElementsPerAccess / IteratorA::kAccessesPerVector / 8; + + int src_bytes = (iterator_A.valid() ? 
kSrcBytes : 0); + + cutlass::arch::cp_async_zfill(dst_ptr + v, iterator_A.get(), iterator_A.valid()); + + ++iterator_A; + } + + ++this->smem_iterator_A_; + } + + iterator_B.set_iteration_index(0); + this->smem_iterator_B_.set_iteration_index(0); + + // Async Copy for operand B + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < Detail::AsyncCopyIterationsPerStageB; ++j) { + typename IteratorB::AccessType* dst_ptr = + reinterpret_cast(this->smem_iterator_B_.get()); + + CUTLASS_PRAGMA_UNROLL + for (int v = 0; v < IteratorB::kAccessesPerVector; ++v) { + int const kSrcBytes = sizeof_bits::value * + IteratorB::ThreadMap::kElementsPerAccess / IteratorB::kAccessesPerVector / 8; + + cutlass::arch::cp_async_zfill(dst_ptr + v, iterator_B.get(), iterator_B.valid()); + + ++iterator_B; + } + + ++this->smem_iterator_B_; + } + + copy_scales_and_advance(iterator_scale, stage, gemm_k_iterations); + + // Move to the next stage + iterator_A.add_tile_offset({0, 1}); + iterator_B.add_tile_offset({1, 0}); + + this->smem_iterator_A_.add_tile_offset({0, 1}); + this->smem_iterator_B_.add_tile_offset({1, 0}); + + // Defines the boundary of a stage of cp.async. + cutlass::arch::cp_async_fence(); + } + + // Perform accumulation in the 'd' output operand + accum = src_accum; + + // + // Clear the remaining tiles of SMEM. This is a functional requirement for some kernels + // so that all accumulator elements outside the GEMM footprint are zero. + // + + if (SharedMemoryClear == SharedMemoryClearOption::kClearLastStage) { + /// Iterator to write threadblock-scoped tile of A operand to shared memory + SmemIteratorA last_smem_iterator_A(this->smem_iterator_A_); + + typename IteratorA::AccessType zero_A; + zero_A.clear(); + + last_smem_iterator_A.set_iteration_index(0); + + // Async Copy for operand A + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < Detail::AsyncCopyIterationsPerStageA; ++j) { + typename IteratorA::AccessType* dst_ptr = + reinterpret_cast(last_smem_iterator_A.get()); + + *dst_ptr = zero_A; + + ++last_smem_iterator_A; + } + + /// Iterator to write threadblock-scoped tile of B operand to shared memory + SmemIteratorB last_smem_iterator_B(this->smem_iterator_B_); + typename IteratorB::AccessType zero_B; + + zero_B.clear(); + last_smem_iterator_B.set_iteration_index(0); + + // Async Copy for operand B + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < Detail::AsyncCopyIterationsPerStageB; ++j) { + typename IteratorB::AccessType* dst_ptr = + reinterpret_cast(last_smem_iterator_B.get()); + + *dst_ptr = zero_B; + + ++last_smem_iterator_B; + } + } + + // Wait until we have at least one committed global fetch stage. 
+    // (#uncommitted = Base::kStages - 1 - #committed)
+    cutlass::arch::cp_async_wait<Base::kStages - 2>();
+    __syncthreads();
+
+    // Pair of fragments used to overlap shared memory loads and math
+    // instructions
+    WarpFragmentA warp_frag_A[2];
+    WarpFragmentB warp_frag_B[2];
+    typename Dequantizer::FragmentScale warp_frag_scales;
+    typename Dequantizer::FragmentZero warp_frag_zeros;
+
+    Operator warp_mma;
+
+    this->warp_tile_iterator_A_.set_kgroup_index(0);
+    this->warp_tile_iterator_B_.set_kgroup_index(0);
+
+    this->warp_tile_iterator_A_.load(warp_frag_A[0]);
+    this->warp_tile_iterator_B_.load(warp_frag_B[0]);
+
+    warp_dequantizer_.load(warp_frag_scales, warp_frag_zeros);
+
+    ++this->warp_tile_iterator_A_;
+    ++this->warp_tile_iterator_B_;
+    warp_dequantizer_.add_pointer_offset(Shape::kN);
+
+    iterator_A.clear_mask(gemm_k_iterations == 0);
+    iterator_B.clear_mask(gemm_k_iterations == 0);
+    iterator_scale.clear_mask(gemm_k_iterations == 0);
+
+    int smem_write_stage_idx = Base::kStages - 1;
+    int smem_read_stage_idx = 0;
+
+    //
+    // Mainloop
+    //
+
+    CUTLASS_GEMM_LOOP
+    for (; gemm_k_iterations > (-Base::kStages + 1);) {
+      //
+      // Loop over GEMM K dimension
+      //
+
+      // Computes a warp-level GEMM on data held in shared memory
+      // Each "warp_mma_k" refers to a warp-level matrix multiply-accumulate
+      CUTLASS_PRAGMA_UNROLL
+      for (int warp_mma_k = 0; warp_mma_k < Base::kWarpGemmIterations; ++warp_mma_k) {
+        // Load warp-level tiles from shared memory, wrapping to k offset if
+        // this is the last group as the case may be.
+
+        this->warp_tile_iterator_A_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations);
+        this->warp_tile_iterator_A_.load(warp_frag_A[(warp_mma_k + 1) % 2]);
+        ++this->warp_tile_iterator_A_;
+
+        int const warp_tileB_k_compute_offset = warp_mma_k % Base::kNumKIterationsPerWarpBLoad;
+        int const warp_tileB_k_load_offset = warp_mma_k / Base::kNumKIterationsPerWarpBLoad;
+        if (warp_tileB_k_compute_offset == Base::kNumKIterationsPerWarpBLoad - 1) {
+          this->warp_tile_iterator_B_.set_kgroup_index((warp_tileB_k_load_offset + 1) % Base::kWarpGemmIterationsForB);
+          this->warp_tile_iterator_B_.load(warp_frag_B[(warp_tileB_k_load_offset + 1) % 2]);
+          ++this->warp_tile_iterator_B_;
+        }
+
+        typename TransformBAfterLDS::result_type converted_frag_B =
+            lds_converter(warp_frag_B[warp_tileB_k_load_offset % 2]);
+        warp_dequantizer_.dequantize(converted_frag_B, warp_frag_scales, warp_frag_zeros);
+
+        run_warp_mma(warp_mma, accum, warp_frag_A[warp_mma_k % 2], converted_frag_B, accum,
+                     warp_tileB_k_compute_offset);
+
+        // Issue global->shared copies for this stage
+        if (warp_mma_k < Base::kWarpGemmIterations - 1) {
+          int group_start_iteration_A, group_start_iteration_B;
+
+          group_start_iteration_A = warp_mma_k * Detail::kAccessesPerGroupA;
+          group_start_iteration_B = warp_mma_k * Detail::kAccessesPerGroupB;
+
+          copy_tiles_and_advance(iterator_A, iterator_B, iterator_scale, group_start_iteration_A,
+                                 group_start_iteration_B);
+
+          // This is the first group of a given stage, so we issue the loads for the B scales immediately.
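+          // The fine-grained scales/zeros occupy only a single row of shared
+          // memory per stage, so one copy per stage suffices; gating on group 0
+          // avoids re-issuing that copy for every group of A/B accesses.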
+ if (group_start_iteration_B == 0) { + copy_scales_and_advance(iterator_scale); + } + } + + if (warp_mma_k + 2 == Base::kWarpGemmIterations) { + int group_start_iteration_A, group_start_iteration_B; + group_start_iteration_A = (warp_mma_k + 1) * Detail::kAccessesPerGroupA; + group_start_iteration_B = (warp_mma_k + 1) * Detail::kAccessesPerGroupB; + + copy_tiles_and_advance(iterator_A, iterator_B, iterator_scale, group_start_iteration_A, + group_start_iteration_B); + + // Inserts a memory fence between stages of cp.async instructions. + cutlass::arch::cp_async_fence(); + + // Wait until we have at least one committed global fetch stage. (#uncommitted = Base::kStages - 1 - + // #committed) + arch::cp_async_wait(); + __syncthreads(); + + // Move to the next stage + iterator_A.add_tile_offset({0, 1}); + iterator_B.add_tile_offset({1, 0}); + + this->smem_iterator_A_.add_tile_offset({0, 1}); + this->smem_iterator_B_.add_tile_offset({1, 0}); + + // Add negative offsets to return iterators to the 'start' of the + // circular buffer in shared memory + if (smem_write_stage_idx == (Base::kStages - 1)) { + this->smem_iterator_A_.add_tile_offset({0, -Base::kStages}); + this->smem_iterator_B_.add_tile_offset({-Base::kStages, 0}); + this->smem_iterator_scale_.add_tile_offset({-Base::kStages, 0}); + smem_write_stage_idx = 0; + } else { + ++smem_write_stage_idx; + } + + if (smem_read_stage_idx == (Base::kStages - 1)) { + this->warp_tile_iterator_A_.add_tile_offset( + {0, -Base::kStages * Policy::kPartitionsK * Base::kWarpGemmIterations}); + this->warp_tile_iterator_B_.add_tile_offset( + {-Base::kStages * Policy::kPartitionsK * Base::kWarpGemmIterationsForB, 0}); + warp_dequantizer_.add_pointer_offset(-Base::kStages * Shape::kN); + smem_read_stage_idx = 0; + } else { + ++smem_read_stage_idx; + } + + --gemm_k_iterations; + iterator_A.clear_mask(gemm_k_iterations == 0); + iterator_B.clear_mask(gemm_k_iterations == 0); + iterator_scale.clear_mask(gemm_k_iterations == 0); + } + } + + // Load the scale needed for the next tile iteration. + warp_dequantizer_.load(warp_frag_scales, warp_frag_zeros); + // Update internal pointer to set of scales in shared memory. + warp_dequantizer_.add_pointer_offset(Shape::kN); + } + + if (SharedMemoryClear == SharedMemoryClearOption::kZfill) { + // commit and drain all pending and predicated LDGSTS pnz from the GEMM mainloop + cutlass::arch::cp_async_fence(); + cutlass::arch::cp_async_wait<0>(); + __syncthreads(); + } + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace gemm +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/onnxruntime/contrib_ops/cuda/moe/cutlass_extensions/gemm/threadblock/dq_mma_multistage_percol.h b/onnxruntime/contrib_ops/cuda/moe/cutlass_extensions/gemm/threadblock/dq_mma_multistage_percol.h new file mode 100644 index 000000000000..33bcb1910638 --- /dev/null +++ b/onnxruntime/contrib_ops/cuda/moe/cutlass_extensions/gemm/threadblock/dq_mma_multistage_percol.h @@ -0,0 +1,586 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. 
Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Template for a double-buffered threadblock-scoped GEMM kernel. +*/ + +#pragma once + +#include "cutlass/aligned_buffer.h" +#include "cutlass/arch/memory.h" +#include "cutlass/array.h" +#include "cutlass/cutlass.h" +#include "cutlass/gemm/gemm.h" +#include "cutlass/matrix_shape.h" +#include "cutlass/numeric_types.h" + +#include "contrib_ops/cuda/moe/cutlass_extensions/gemm/threadblock/dq_mma_base.h" +#include "contrib_ops/cuda/moe/cutlass_extensions/gemm/warp/mma_tensorop_dequantizer.h" +#include "contrib_ops/cuda/moe/cutlass_extensions/interleaved_numeric_conversion.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace gemm { +namespace threadblock { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Structure to compute the matrix product targeting CUDA cores and SIMT math +/// instructions. 
+template < + /// Size of the Gemm problem - concept: gemm::GemmShape<> + typename Shape_, + /// Iterates over tiles of A operand in global memory + // (concept: ReadableTileIterator | ForwardTileIterator | + // MaskedTileIterator) + typename IteratorA_, + /// Iterates over tiles of A operand in shared memory + /// (concept: WriteableTileIterator | RandomAccessTileIterator) + typename SmemIteratorA_, + /// Cache operation for operand A + cutlass::arch::CacheOperation::Kind CacheOpA, + /// Iterates over tiles of B operand in global memory + // (concept: ReadableTileIterator | ForwardTileIterator | + // MaskedTileIterator) + typename IteratorB_, + /// Iterates over tiles of B operand in shared memory + /// (concept: WriteableTileIterator | RandomAccessTileIterator) + typename SmemIteratorB_, + /// Cache operation for operand B + cutlass::arch::CacheOperation::Kind CacheOpB, + /// Data type for the scales + typename IteratorScale_, + /// Iterators over scales in shared memory + typename SmemIteratorScale_, + /// Data type of accumulator matrix + typename ElementC_, + /// Data type of accumulator matrix + typename LayoutC_, + /// Policy describing tuning details (concept: MmaPolicy) + typename Policy_, + /// Number of stages, + int Stages, + /// Converter for B matrix applited immediately after the LDS + typename TransformBAfterLDS_, + /// The quantization operator being used + WeightOnlyQuantOp QuantOp_, + /// Use zfill or predicate for out-of-bound cp.async + SharedMemoryClearOption SharedMemoryClear> +class DqMmaMultistage> + : public DqMmaBase { + public: + ///< Base class + using Base = DqMmaBase; + ///< Size of the Gemm problem - concept: gemm::GemmShape<> + using Shape = Shape_; + ///< Iterates over tiles of A operand in global memory + using IteratorA = IteratorA_; + ///< Iterates over tiles of B operand in global memory + using IteratorB = IteratorB_; + ///< Data type of accumulator matrix + using ElementC = ElementC_; + ///< Layout of accumulator matrix + using LayoutC = LayoutC_; + ///< Policy describing tuning details + using Policy = Policy_; + + using IteratorScale = IteratorScale_; + using ElementScale = typename IteratorScale::Element; + using LayoutScale = typename IteratorScale::Layout; + + using SmemIteratorA = SmemIteratorA_; + using SmemIteratorB = SmemIteratorB_; + using SmemIteratorScale = SmemIteratorScale_; + + static cutlass::arch::CacheOperation::Kind const kCacheOpA = CacheOpA; + static cutlass::arch::CacheOperation::Kind const kCacheOpB = CacheOpB; + + using TransformBAfterLDS = TransformBAfterLDS_; + + static constexpr WeightOnlyQuantOp QuantOp = QuantOp_; + + // + // Dependent types + // + + /// Fragment of operand Scale loaded from global memory; + using FragmentScale = typename IteratorScale::Fragment; + + /// Fragment of accumulator tile + using FragmentC = typename Policy::Operator::FragmentC; + + /// Warp-level Mma + using Operator = typename Policy::Operator; + + /// Minimum architecture is Sm80 to support cp.async + using ArchTag = arch::Sm80; + + using Dequantizer = warp::MmaTensorOpDequantizer; + + /// Complex transform on A operand + static ComplexTransform const kTransformA = Operator::kTransformA; + + /// Complex transform on B operand + static ComplexTransform const kTransformB = Operator::kTransformB; + + /// Internal structure exposed for introspection. 
+ struct Detail { + static_assert(Base::kWarpGemmIterations > 1, + "The pipelined structure requires at least two warp-level " + "GEMM operations."); + + /// Number of cp.async instructions to load one stage of operand A + static int const AsyncCopyIterationsPerStageA = IteratorA::ThreadMap::Iterations::kCount; + + /// Number of cp.async instructions to load one stage of operand B + static int const AsyncCopyIterationsPerStageB = IteratorB::ThreadMap::Iterations::kCount; + + /// Number of stages + static int const kStages = Stages; + + /// Number of cp.async instructions to load on group of operand A + static int const kAccessesPerGroupA = + (AsyncCopyIterationsPerStageA + Base::kWarpGemmIterations - 1) / Base::kWarpGemmIterations; + + /// Number of cp.async instructions to load on group of operand B + static int const kAccessesPerGroupB = + (AsyncCopyIterationsPerStageB + Base::kWarpGemmIterations - 1) / Base::kWarpGemmIterations; + }; + + private: + using WarpFragmentA = typename Operator::FragmentA; + using WarpFragmentB = typename Operator::FragmentB; + Dequantizer warp_dequantizer_; + + using ElementB = typename IteratorB::Element; + using LayoutDetailsForB = kernel::LayoutDetailsB; + + static constexpr bool RequiresTileInterleave = + layout::IsColumnMajorTileInterleave::value; + static_assert(!RequiresTileInterleave || (RequiresTileInterleave && (Shape::kK == LayoutDetailsForB::ThreadblockK)), + "Layout K must match threadblockK"); + + private: + // + // Data members + // + + /// Iterator to write threadblock-scoped tile of A operand to shared memory + SmemIteratorA smem_iterator_A_; + + /// Iterator to write threadblock-scoped tile of B operand to shared memory + SmemIteratorB smem_iterator_B_; + + /// Iterator to write threadblock-scoped tile of scale operand to shared memory + SmemIteratorScale smem_iterator_scale_; + + public: + /// Construct from tensor references + CUTLASS_DEVICE + DqMmaMultistage( + ///< Shared storage needed for internal use by threadblock-scoped GEMM + typename Base::SharedStorage& shared_storage, + ///< Group size for quantization. 
Not used by this main loop since it assumes per-column + int const group_size, + ///< ID within the threadblock + int thread_idx, + ///< ID of warp + int warp_idx, + ///< ID of each thread within a warp + int lane_idx) + : Base(shared_storage, thread_idx, warp_idx, lane_idx), + warp_dequantizer_({shared_storage.operand_scale.data(), LayoutScale(Shape::kN)}, + (warp_idx % (Base::WarpCount::kM * Base::WarpCount::kN)) / Base::WarpCount::kM, lane_idx), + smem_iterator_A_(shared_storage.operand_A_ref(), thread_idx), + smem_iterator_B_(shared_storage.operand_B_ref(), thread_idx), + smem_iterator_scale_(LayoutScale(Shape::kN), shared_storage.operand_scale.data(), {1, Shape::kN}, thread_idx) { + // Compute warp location within threadblock tile by mapping the warp_id to + // three coordinates: + // _m: the warp's position within the threadblock along the M dimension + // _n: the warp's position within the threadblock along the N dimension + // _k: the warp's position within the threadblock along the K dimension + + int warp_idx_mn = warp_idx % (Base::WarpCount::kM * Base::WarpCount::kN); + int warp_idx_k = warp_idx / (Base::WarpCount::kM * Base::WarpCount::kN); + + int warp_idx_m = warp_idx_mn % Base::WarpCount::kM; + int warp_idx_n = warp_idx_mn / Base::WarpCount::kM; + + // Add per-warp offsets in units of warp-level tiles + this->warp_tile_iterator_A_.add_tile_offset({warp_idx_m, Base::kWarpGemmIterations * warp_idx_k}); + this->warp_tile_iterator_B_.add_tile_offset({Base::kWarpGemmIterationsForB * warp_idx_k, warp_idx_n}); + } + + CUTLASS_DEVICE + void copy_tiles_and_advance(IteratorA& iterator_A, IteratorB& iterator_B, int group_start_A = 0, + int group_start_B = 0) { + iterator_A.set_iteration_index(group_start_A * IteratorA::kAccessesPerVector); + this->smem_iterator_A_.set_iteration_index(group_start_A); + + // Async Copy for operand A + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < Detail::kAccessesPerGroupA; ++j) { + if (group_start_A + j < Detail::AsyncCopyIterationsPerStageA) { + typename IteratorA::AccessType* dst_ptr = + reinterpret_cast(this->smem_iterator_A_.get()); + + int const kSrcBytes = sizeof_bits::value * + IteratorA::ThreadMap::kElementsPerAccess / IteratorA::kAccessesPerVector / 8; + + CUTLASS_PRAGMA_UNROLL + for (int v = 0; v < IteratorA::kAccessesPerVector; ++v) { + auto gmem_ptr = iterator_A.get(); + + if (SharedMemoryClear == SharedMemoryClearOption::kZfill) { + cutlass::arch::cp_async_zfill(dst_ptr + v, gmem_ptr, iterator_A.valid()); + } else { + cutlass::arch::cp_async(dst_ptr + v, gmem_ptr, iterator_A.valid()); + } + + ++iterator_A; + } + + ++this->smem_iterator_A_; + } + } + + iterator_B.set_iteration_index(group_start_B * IteratorB::kAccessesPerVector); + this->smem_iterator_B_.set_iteration_index(group_start_B); + + // Async Copy for operand B + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < Detail::kAccessesPerGroupB; ++j) { + if (group_start_B + j < Detail::AsyncCopyIterationsPerStageB) { + typename IteratorB::AccessType* dst_ptr = + reinterpret_cast(this->smem_iterator_B_.get()); + + int const kSrcBytes = sizeof_bits::value * + IteratorB::ThreadMap::kElementsPerAccess / IteratorB::kAccessesPerVector / 8; + + CUTLASS_PRAGMA_UNROLL + for (int v = 0; v < IteratorB::kAccessesPerVector; ++v) { + auto gmem_ptr = iterator_B.get(); + + if (SharedMemoryClear == SharedMemoryClearOption::kZfill) { + cutlass::arch::cp_async_zfill(dst_ptr + v, gmem_ptr, iterator_B.valid()); + } else { + cutlass::arch::cp_async(dst_ptr + v, gmem_ptr, iterator_B.valid()); + } + + ++iterator_B; + } 
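+          // With SharedMemoryClearOption::kZfill, out-of-bounds accesses write
+          // zeros into shared memory; with the default predicated cp.async form,
+          // invalid accesses are simply skipped and the previous shared-memory
+          // contents are left untouched.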
+ ++this->smem_iterator_B_; + } + } + } + + /// Perform a threadblock-scoped matrix multiply-accumulate + CUTLASS_DEVICE + void operator()( + ///< problem size of GEMM + int gemm_k_iterations, + ///< destination accumulator tile + FragmentC& accum, + ///< iterator over A operand in global memory + IteratorA iterator_A, + ///< iterator over B operand in global memory + IteratorB iterator_B, + ///< iterator over scale operand in global memory + IteratorScale iterator_scale, + ///< initial value of accumulator + FragmentC const& src_accum) { + // + // Prologue + // + + TransformBAfterLDS lds_converter; + + // NOTE - switch to ldg.sts + // Issue this first, so cp.async.commit_group will commit this load as well. + // Note: we do not commit here and this load will commit in the same group as + // the first load of A. + FragmentScale tb_frag_scales; + tb_frag_scales.clear(); + iterator_scale.load(tb_frag_scales); + this->smem_iterator_scale_.store(tb_frag_scales); + + // Issue several complete stages + CUTLASS_PRAGMA_UNROLL + for (int stage = 0; stage < Base::kStages - 1; ++stage, --gemm_k_iterations) { + iterator_A.clear_mask(gemm_k_iterations == 0); + iterator_B.clear_mask(gemm_k_iterations == 0); + + iterator_A.set_iteration_index(0); + this->smem_iterator_A_.set_iteration_index(0); + + // Async Copy for operand A + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < Detail::AsyncCopyIterationsPerStageA; ++j) { + typename IteratorA::AccessType* dst_ptr = + reinterpret_cast(this->smem_iterator_A_.get()); + + CUTLASS_PRAGMA_UNROLL + for (int v = 0; v < IteratorA::kAccessesPerVector; ++v) { + int const kSrcBytes = sizeof_bits::value * + IteratorA::ThreadMap::kElementsPerAccess / IteratorA::kAccessesPerVector / 8; + + int src_bytes = (iterator_A.valid() ? kSrcBytes : 0); + + cutlass::arch::cp_async_zfill(dst_ptr + v, iterator_A.get(), iterator_A.valid()); + + ++iterator_A; + } + + ++this->smem_iterator_A_; + } + + iterator_B.set_iteration_index(0); + this->smem_iterator_B_.set_iteration_index(0); + + // Async Copy for operand B + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < Detail::AsyncCopyIterationsPerStageB; ++j) { + typename IteratorB::AccessType* dst_ptr = + reinterpret_cast(this->smem_iterator_B_.get()); + + CUTLASS_PRAGMA_UNROLL + for (int v = 0; v < IteratorB::kAccessesPerVector; ++v) { + int const kSrcBytes = sizeof_bits::value * + IteratorB::ThreadMap::kElementsPerAccess / IteratorB::kAccessesPerVector / 8; + + cutlass::arch::cp_async_zfill(dst_ptr + v, iterator_B.get(), iterator_B.valid()); + + ++iterator_B; + } + + ++this->smem_iterator_B_; + } + + // Move to the next stage + iterator_A.add_tile_offset({0, 1}); + iterator_B.add_tile_offset({1, 0}); + + this->smem_iterator_A_.add_tile_offset({0, 1}); + this->smem_iterator_B_.add_tile_offset({1, 0}); + + // Defines the boundary of a stage of cp.async. + cutlass::arch::cp_async_fence(); + } + + // Perform accumulation in the 'd' output operand + accum = src_accum; + + // + // Clear the remaining tiles of SMEM. This is a functional requirement for some kernels + // so that all accumulator elements outside the GEMM footprint are zero. 
+ // + + if (SharedMemoryClear == SharedMemoryClearOption::kClearLastStage) { + /// Iterator to write threadblock-scoped tile of A operand to shared memory + SmemIteratorA last_smem_iterator_A(this->smem_iterator_A_); + + typename IteratorA::AccessType zero_A; + zero_A.clear(); + + last_smem_iterator_A.set_iteration_index(0); + + // Async Copy for operand A + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < Detail::AsyncCopyIterationsPerStageA; ++j) { + typename IteratorA::AccessType* dst_ptr = + reinterpret_cast(last_smem_iterator_A.get()); + + *dst_ptr = zero_A; + + ++last_smem_iterator_A; + } + + /// Iterator to write threadblock-scoped tile of B operand to shared memory + SmemIteratorB last_smem_iterator_B(this->smem_iterator_B_); + typename IteratorB::AccessType zero_B; + + zero_B.clear(); + last_smem_iterator_B.set_iteration_index(0); + + // Async Copy for operand B + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < Detail::AsyncCopyIterationsPerStageB; ++j) { + typename IteratorB::AccessType* dst_ptr = + reinterpret_cast(last_smem_iterator_B.get()); + + *dst_ptr = zero_B; + + ++last_smem_iterator_B; + } + } + + // Waits until kStages-2 stages have committed. + cutlass::arch::cp_async_wait(); + __syncthreads(); + + // Pair of fragments used to overlap shared memory loads and math + // instructions + WarpFragmentA warp_frag_A[2]; + WarpFragmentB warp_frag_B[2]; + typename Dequantizer::FragmentScale warp_frag_scales; + + Operator warp_mma; + + this->warp_tile_iterator_A_.set_kgroup_index(0); + this->warp_tile_iterator_B_.set_kgroup_index(0); + + this->warp_tile_iterator_A_.load(warp_frag_A[0]); + this->warp_tile_iterator_B_.load(warp_frag_B[0]); + warp_dequantizer_.load(warp_frag_scales); + + ++this->warp_tile_iterator_A_; + ++this->warp_tile_iterator_B_; + + iterator_A.clear_mask(gemm_k_iterations == 0); + iterator_B.clear_mask(gemm_k_iterations == 0); + + int smem_write_stage_idx = Base::kStages - 1; + int smem_read_stage_idx = 0; + + // + // Mainloop + // + + CUTLASS_GEMM_LOOP + for (; gemm_k_iterations > (-Base::kStages + 1);) { + // + // Loop over GEMM K dimension + // + + // Computes a warp-level GEMM on data held in shared memory + // Each "warp_mma_k" refers to a warp-level matrix multiply-accumulate + CUTLASS_PRAGMA_UNROLL + for (int warp_mma_k = 0; warp_mma_k < Base::kWarpGemmIterations; ++warp_mma_k) { + // Load warp-level tiles from shared memory, wrapping to k offset if + // this is the last group as the case may be. 
+
+        this->warp_tile_iterator_A_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations);
+        this->warp_tile_iterator_A_.load(warp_frag_A[(warp_mma_k + 1) % 2]);
+        ++this->warp_tile_iterator_A_;
+
+        int const warp_tileB_k_compute_offset = warp_mma_k % Base::kNumKIterationsPerWarpBLoad;
+        int const warp_tileB_k_load_offset = warp_mma_k / Base::kNumKIterationsPerWarpBLoad;
+        if (warp_tileB_k_compute_offset == Base::kNumKIterationsPerWarpBLoad - 1) {
+          this->warp_tile_iterator_B_.set_kgroup_index((warp_tileB_k_load_offset + 1) % Base::kWarpGemmIterationsForB);
+          this->warp_tile_iterator_B_.load(warp_frag_B[(warp_tileB_k_load_offset + 1) % 2]);
+          ++this->warp_tile_iterator_B_;
+        }
+
+        typename TransformBAfterLDS::result_type converted_frag_B =
+            lds_converter(warp_frag_B[warp_tileB_k_load_offset % 2]);
+        warp_dequantizer_.dequantize(converted_frag_B, warp_frag_scales);
+
+        run_warp_mma(warp_mma, accum, warp_frag_A[warp_mma_k % 2], converted_frag_B, accum,
+                     warp_tileB_k_compute_offset);
+
+        // Issue global->shared copies for this stage
+        if (warp_mma_k < Base::kWarpGemmIterations - 1) {
+          int group_start_iteration_A, group_start_iteration_B;
+
+          group_start_iteration_A = warp_mma_k * Detail::kAccessesPerGroupA;
+          group_start_iteration_B = warp_mma_k * Detail::kAccessesPerGroupB;
+
+          copy_tiles_and_advance(iterator_A, iterator_B, group_start_iteration_A, group_start_iteration_B);
+        }
+
+        if (warp_mma_k + 2 == Base::kWarpGemmIterations) {
+          int group_start_iteration_A, group_start_iteration_B;
+          group_start_iteration_A = (warp_mma_k + 1) * Detail::kAccessesPerGroupA;
+          group_start_iteration_B = (warp_mma_k + 1) * Detail::kAccessesPerGroupB;
+
+          copy_tiles_and_advance(iterator_A, iterator_B, group_start_iteration_A, group_start_iteration_B);
+
+          // Inserts a memory fence between stages of cp.async instructions.
+          cutlass::arch::cp_async_fence();
+
+          // Waits until kStages-2 stages have committed.
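+          // cp_async_wait<N> blocks until at most N cp.async commit groups are
+          // still in flight, so waiting on Base::kStages - 2 guarantees the
+          // oldest stage has landed in shared memory before the warps read it.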
+          arch::cp_async_wait<Base::kStages - 2>();
+          __syncthreads();
+
+          // Move to the next stage
+          iterator_A.add_tile_offset({0, 1});
+          iterator_B.add_tile_offset({1, 0});
+
+          this->smem_iterator_A_.add_tile_offset({0, 1});
+          this->smem_iterator_B_.add_tile_offset({1, 0});
+
+          // Add negative offsets to return iterators to the 'start' of the
+          // circular buffer in shared memory
+          if (smem_write_stage_idx == (Base::kStages - 1)) {
+            this->smem_iterator_A_.add_tile_offset({0, -Base::kStages});
+            this->smem_iterator_B_.add_tile_offset({-Base::kStages, 0});
+            smem_write_stage_idx = 0;
+          } else {
+            ++smem_write_stage_idx;
+          }
+
+          if (smem_read_stage_idx == (Base::kStages - 1)) {
+            this->warp_tile_iterator_A_.add_tile_offset(
+                {0, -Base::kStages * Policy::kPartitionsK * Base::kWarpGemmIterations});
+            this->warp_tile_iterator_B_.add_tile_offset(
+                {-Base::kStages * Policy::kPartitionsK * Base::kWarpGemmIterationsForB, 0});
+            smem_read_stage_idx = 0;
+          } else {
+            ++smem_read_stage_idx;
+          }
+
+          --gemm_k_iterations;
+          iterator_A.clear_mask(gemm_k_iterations == 0);
+          iterator_B.clear_mask(gemm_k_iterations == 0);
+        }
+      }
+    }
+
+    if (SharedMemoryClear == SharedMemoryClearOption::kZfill) {
+      // Commit and drain all pending and predicated LDGSTS (cp.async) instructions from the GEMM mainloop
+      cutlass::arch::cp_async_fence();
+      cutlass::arch::cp_async_wait<0>();
+      __syncthreads();
+    }
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+}  // namespace threadblock
+}  // namespace gemm
+}  // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/onnxruntime/contrib_ops/cuda/moe/cutlass_extensions/gemm/threadblock/dq_mma_pipelined.h b/onnxruntime/contrib_ops/cuda/moe/cutlass_extensions/gemm/threadblock/dq_mma_pipelined.h
new file mode 100644
index 000000000000..2c85ba8a1995
--- /dev/null
+++ b/onnxruntime/contrib_ops/cuda/moe/cutlass_extensions/gemm/threadblock/dq_mma_pipelined.h
@@ -0,0 +1,379 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Template for a double-buffered threadblock-scoped GEMM kernel. +*/ + +#pragma once + +#include "cutlass/aligned_buffer.h" +#include "cutlass/array.h" +#include "cutlass/cutlass.h" +#include "cutlass/numeric_conversion.h" + +#include "cutlass/matrix_shape.h" +#include "cutlass/numeric_types.h" + +#include "cutlass/gemm/gemm.h" + +#include "contrib_ops/cuda/moe/cutlass_extensions/gemm/threadblock/dq_mma_base.h" +#include "contrib_ops/cuda/moe/cutlass_extensions/gemm/warp/mma_tensorop_dequantizer.h" +#include "contrib_ops/cuda/moe/cutlass_extensions/interleaved_numeric_conversion.h" + +#include "contrib_ops/cuda/moe/cutlass_extensions/gemm/kernel/mixed_gemm_B_layout.h" +#include "contrib_ops/cuda/moe/cutlass_extensions/gemm_configs.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace gemm { +namespace threadblock { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Structure to compute the matrix product targeting CUDA cores and SIMT math instructions. +template < + /// Size of the Gemm problem - concept: gemm::GemmShape<> + typename Shape_, + /// Iterates over tiles of A operand in global memory + // (concept: ReadableTileIterator | ForwardTileIterator | MaskedTileIterator) + typename IteratorA_, + /// Iterates over tiles of A operand in shared memory + /// (concept: WriteableTileIterator | RandomAccessTileIterator) + typename SmemIteratorA_, + /// Iterates over tiles of B operand in global memory + // (concept: ReadableTileIterator | ForwardTileIterator | MaskedTileIterator) + typename IteratorB_, + /// Iterates over tiles of B operand in shared memory + /// (concept: WriteableTileIterator | RandomAccessTileIterator) + typename SmemIteratorB_, + /// Data type for the scales + typename IteratorScale_, + /// Iterators over scales in shared memory + typename SmemIteratorScale_, + /// Data type of accumulator matrix + typename ElementC_, + /// Data type of accumulator matrix + typename LayoutC_, + /// Policy describing tuning details (concept: MmaPolicy) + typename Policy_, + /// Converter for B matrix applied immediately after the LDG (before STS) + typename TransformBAfterLDG_, + /// Converter for B matrix applited immediately after the LDS + typename TransformBAfterLDS_, + /// The quantization operator being used + WeightOnlyQuantOp QuantOp_, + /// Used for partial specialization + typename Enable = bool> +class DqMmaPipelined : public DqMmaBase { + public: + ///< Base class + using Base = DqMmaBase; + + using Shape = Shape_; ///< Size of the Gemm problem - concept: gemm::GemmShape<> + using IteratorA = IteratorA_; ///< Iterates over tiles of A operand in global memory + using IteratorB = IteratorB_; ///< Iterates over tiles of B operand in global memory + using ElementC = ElementC_; ///< Data type 
of accumulator matrix + using LayoutC = LayoutC_; ///< Layout of accumulator matrix + using Policy = Policy_; ///< Policy describing tuning details + + using IteratorScale = IteratorScale_; + using ElementScale = typename IteratorScale::Element; + using LayoutScale = typename IteratorScale::Layout; + + using SmemIteratorA = SmemIteratorA_; + using SmemIteratorB = SmemIteratorB_; + using SmemIteratorScale = SmemIteratorScale_; + + using TransformBAfterLDG = TransformBAfterLDG_; + using TransformBAfterLDS = TransformBAfterLDS_; + + static constexpr WeightOnlyQuantOp QuantOp = QuantOp_; + + // + // Dependent types + // + + /// Fragment of operand A loaded from global memory + using FragmentA = typename IteratorA::Fragment; + + /// Fragment of operand B loaded from global memory + using FragmentB = typename IteratorB::Fragment; + + /// Fragment of operand Scale loaded from global memory; + using FragmentScale = typename IteratorScale::Fragment; + + /// Fragment of accumulator tile + using FragmentC = typename Policy::Operator::FragmentC; + + /// Warp-level Mma + using Operator = typename Policy::Operator; + + /// Obtain the arch tag from the warp-level operator + using ArchTag = typename Policy::Operator::ArchTag; + + using Dequantizer = + warp::MmaTensorOpDequantizer; + + /// Complex transform on A operand + static ComplexTransform const kTransformA = Operator::kTransformA; + + /// Complex transform on B operand + static ComplexTransform const kTransformB = Operator::kTransformB; + + // staticaly assert kStages for DqMmaPipelined is two (Double-buffered pipeline) + static_assert((Base::kStages == 2), "DqMmaPipelined requires kStages set to value 2"); + + private: + using WarpFragmentA = typename Operator::FragmentA; + using WarpFragmentB = typename Operator::FragmentB; + Dequantizer warp_dequantizer_; + + using ElementB = typename IteratorB::Element; + using LayoutDetailsForB = kernel::LayoutDetailsB; + + static constexpr bool RequiresTileInterleave = + layout::IsColumnMajorTileInterleave::value; + static_assert(!RequiresTileInterleave || (RequiresTileInterleave && (Shape::kK == LayoutDetailsForB::ThreadblockK)), + "Layout K must match threadblockK"); + + protected: + /// Iterator to write threadblock-scoped tile of A operand to shared memory + SmemIteratorA smem_iterator_A_; + + /// Iterator to write threadblock-scoped tile of B operand to shared memory + SmemIteratorB smem_iterator_B_; + + /// Iterator to write threadblock-scoped tile of scale operand to shared memory + SmemIteratorScale smem_iterator_scale_; + + public: + /// Construct from tensor references + CUTLASS_DEVICE + DqMmaPipelined( + typename Base::SharedStorage& + shared_storage, ///< Shared storage needed for internal use by threadblock-scoped GEMM + int const group_size, ///< Will not be used, just to adapt to finegrained modifications and make the compilation + ///< successful. Because DqMmaPipelined is only enabled for sm<80, so even if this + ///< argument is not added, it does not affect compilation for sm>=80. 
+ int thread_idx, ///< ID within the threadblock + int warp_idx, ///< ID of warp + int lane_idx ///< ID of each thread within a warp + ) + : Base(shared_storage, thread_idx, warp_idx, lane_idx), + warp_dequantizer_({shared_storage.operand_scale.data(), LayoutScale(Shape::kN)}, + (warp_idx % (Base::WarpCount::kM * Base::WarpCount::kN)) / Base::WarpCount::kM, lane_idx), + smem_iterator_A_(shared_storage.operand_A_ref(), thread_idx), + smem_iterator_B_(shared_storage.operand_B_ref(), thread_idx), + smem_iterator_scale_(LayoutScale(Shape::kN), shared_storage.operand_scale.data(), {1, Shape::kN}, thread_idx) { + // Compute warp location within threadblock tile by mapping the warp_id to + // three coordinates: + // _m: the warp's position within the threadblock along the M dimension + // _n: the warp's position within the threadblock along the N dimension + // _k: the warp's position within the threadblock along the K dimension + + int warp_idx_mn = warp_idx % (Base::WarpCount::kM * Base::WarpCount::kN); + int warp_idx_k = warp_idx / (Base::WarpCount::kM * Base::WarpCount::kN); + + int warp_idx_m = warp_idx_mn % Base::WarpCount::kM; + int warp_idx_n = warp_idx_mn / Base::WarpCount::kM; + + // Add per-warp offsets in units of warp-level tiles + this->warp_tile_iterator_A_.add_tile_offset({warp_idx_m, Base::kWarpGemmIterations * warp_idx_k}); + this->warp_tile_iterator_B_.add_tile_offset({Base::kWarpGemmIterationsForB * warp_idx_k, warp_idx_n}); + } + + /// Perform a threadblock-scoped matrix multiply-accumulate + CUTLASS_DEVICE + void operator()(int gemm_k_iterations, ///< number of iterations of the mainloop + FragmentC& accum, ///< destination accumulator tile + IteratorA iterator_A, ///< iterator over A operand in global memory + IteratorB iterator_B, ///< iterator over B operand in global memory + IteratorScale iterator_scale, ///< iterator over scale operand in global memory + FragmentC const& src_accum) { ///< source accumulator tile + // + // Prologue + // + TransformBAfterLDG ldg_converter; + TransformBAfterLDS lds_converter; + + using TransformA = + NumericArrayConverter; + + using TransformScale = NumericArrayConverter; + + // These transforms are mainly to handle when we have bfloat activations and weights in GMEM and want + // to issue HMMA on architectures older than Ampere. We will convert to FP16 before STS. 
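The two transforms declared next are only non-trivial when A and the scales arrive as bf16 but the MMA must run in fp16. A minimal host-compilable sketch of that kind of fragment narrowing, using only CUTLASS headers this file already includes (the fragment width of 8 is an illustrative assumption):

    // Sketch: narrowing a global-memory fragment of bf16 to fp16 before the STS,
    // mirroring what TransformA does when the A element type is bfloat16_t on sm<80.
    // The width 8 is an arbitrary illustrative fragment size.
    #include "cutlass/array.h"
    #include "cutlass/cutlass.h"
    #include "cutlass/numeric_conversion.h"

    using GmemFragment = cutlass::Array<cutlass::bfloat16_t, 8>;
    using SmemFragment = cutlass::Array<cutlass::half_t, 8>;

    CUTLASS_HOST_DEVICE
    SmemFragment to_fp16_for_sts(GmemFragment const& frag) {
      // Element-wise bf16 -> fp16 conversion of the whole fragment.
      cutlass::NumericArrayConverter<cutlass::half_t, cutlass::bfloat16_t, 8> convert;
      return convert(frag);
    }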
+ TransformA transformA; + TransformScale transformScale; + + // Perform accumulation in the 'd' output operand + accum = src_accum; + + FragmentA tb_frag_A; + FragmentB tb_frag_B; + FragmentScale tb_frag_scales; + + using WarpFragmentScale = typename Dequantizer::FragmentScale; + WarpFragmentScale warp_frag_scales; + + tb_frag_A.clear(); + tb_frag_B.clear(); + tb_frag_scales.clear(); + + // The last kblock is loaded in the prologue + iterator_A.load(tb_frag_A); + iterator_B.load(tb_frag_B); + iterator_scale.load(tb_frag_scales); + + ++iterator_A; + ++iterator_B; + + this->smem_iterator_A_.store(transformA(tb_frag_A)); + this->smem_iterator_B_.store(ldg_converter(tb_frag_B)); + this->smem_iterator_scale_.store(transformScale(tb_frag_scales)); + + ++this->smem_iterator_A_; + ++this->smem_iterator_B_; + + __syncthreads(); + + warp_dequantizer_.load(warp_frag_scales); + + // Pair of fragments used to overlap shared memory loads and math instructions + WarpFragmentA warp_frag_A[2]; + WarpFragmentB warp_frag_B[2]; + + this->warp_tile_iterator_A_.set_kgroup_index(0); + this->warp_tile_iterator_B_.set_kgroup_index(0); + + this->warp_tile_iterator_A_.load(warp_frag_A[0]); + this->warp_tile_iterator_B_.load(warp_frag_B[0]); + + ++this->warp_tile_iterator_A_; + ++this->warp_tile_iterator_B_; + + Operator warp_mma; + + int smem_write_stage_idx = 1; + + // Avoid reading out of bounds + iterator_A.clear_mask(gemm_k_iterations <= 1); + iterator_B.clear_mask(gemm_k_iterations <= 1); + + // Issue loads during the first warp-level matrix multiply-add *AFTER* issuing + // shared memory loads (which have the tightest latency requirement). + + // + // Mainloop + // + + // Note: The main loop does not support Base::kWarpGemmIterations == 2. + CUTLASS_GEMM_LOOP + for (; gemm_k_iterations > 0; --gemm_k_iterations) { + // + // Loop over GEMM K dimension + // + + CUTLASS_PRAGMA_UNROLL + for (int warp_mma_k = 0; warp_mma_k < Base::kWarpGemmIterations; ++warp_mma_k) { + // Load warp-level tiles from shared memory, wrapping back to the first k offset + // if this is the last group. + + if (warp_mma_k == Base::kWarpGemmIterations - 1) { + // Write fragments to shared memory + this->smem_iterator_A_.store(transformA(tb_frag_A)); + + this->smem_iterator_B_.store(ldg_converter(tb_frag_B)); + + __syncthreads(); + + ++this->smem_iterator_A_; + ++this->smem_iterator_B_; + + // Add negative offsets to return iterators to the 'start' of the circular buffer in shared memory + if (smem_write_stage_idx == 1) { + this->smem_iterator_A_.add_tile_offset({0, -Base::kStages}); + this->smem_iterator_B_.add_tile_offset({-Base::kStages, 0}); + } else { + this->warp_tile_iterator_A_.add_tile_offset( + {0, -Base::kStages * Policy::kPartitionsK * Base::kWarpGemmIterations}); + this->warp_tile_iterator_B_.add_tile_offset( + {-Base::kStages * Policy::kPartitionsK * Base::kWarpGemmIterationsForB, 0}); + } + + smem_write_stage_idx ^= 1; + } + + this->warp_tile_iterator_A_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations); + this->warp_tile_iterator_A_.load(warp_frag_A[(warp_mma_k + 1) % 2]); + ++this->warp_tile_iterator_A_; + + int const warp_tileB_k_compute_offset = warp_mma_k % Base::kNumKIterationsPerWarpBLoad; + int const warp_tileB_k_load_offset = warp_mma_k / Base::kNumKIterationsPerWarpBLoad; + // We are just about to finish computing on a fragment of B, so initiate the load for the next fragment.
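The guarded load below is easier to follow with concrete numbers. A host-side sketch of the two offsets just computed, assuming kNumKIterationsPerWarpBLoad == 2 (one shared-memory load of B feeding two MMA k-steps, as happens for narrow B types) and eight warp k-iterations; both constants are illustrative stand-ins for the values in Base:

    #include <cstdio>

    int main() {
      constexpr int kNumKIterationsPerWarpBLoad = 2;  // illustrative; comes from Base in the kernel
      constexpr int kWarpGemmIterations = 8;          // illustrative
      for (int warp_mma_k = 0; warp_mma_k < kWarpGemmIterations; ++warp_mma_k) {
        int compute_offset = warp_mma_k % kNumKIterationsPerWarpBLoad;  // slice of the current B fragment
        int load_offset = warp_mma_k / kNumKIterationsPerWarpBLoad;     // which B fragment is consumed
        std::printf("k-step %d -> B fragment %d, slice %d\n", warp_mma_k, load_offset % 2, compute_offset);
      }
      return 0;
    }

A new B fragment is only fetched when the last slice of the current one is about to be consumed, which is exactly the condition tested next.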
+ if (warp_tileB_k_compute_offset == Base::kNumKIterationsPerWarpBLoad - 1) { + this->warp_tile_iterator_B_.set_kgroup_index((warp_tileB_k_load_offset + 1) % Base::kWarpGemmIterationsForB); + this->warp_tile_iterator_B_.load(warp_frag_B[(warp_tileB_k_load_offset + 1) % 2]); + ++this->warp_tile_iterator_B_; + } + + if (warp_mma_k == 0) { + iterator_A.load(tb_frag_A); + iterator_B.load(tb_frag_B); + + ++iterator_A; + ++iterator_B; + + // Avoid reading out of bounds if this was the last loop iteration + iterator_A.clear_mask(gemm_k_iterations <= 2); + iterator_B.clear_mask(gemm_k_iterations <= 2); + } + + typename TransformBAfterLDS::result_type converted_frag_B = + lds_converter(warp_frag_B[warp_tileB_k_load_offset % 2]); + warp_dequantizer_.dequantize(converted_frag_B, warp_frag_scales); + run_warp_mma(warp_mma, accum, warp_frag_A[warp_mma_k % 2], converted_frag_B, accum, + warp_tileB_k_compute_offset); + } + } + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace gemm +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/onnxruntime/contrib_ops/cuda/moe/cutlass_extensions/gemm/warp/default_mma_tensor_op.h b/onnxruntime/contrib_ops/cuda/moe/cutlass_extensions/gemm/warp/default_mma_tensor_op.h new file mode 100644 index 000000000000..f0b6f4fcaad3 --- /dev/null +++ b/onnxruntime/contrib_ops/cuda/moe/cutlass_extensions/gemm/warp/default_mma_tensor_op.h @@ -0,0 +1,103 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Default warp-level GEMM operators selected by data type, size, and layouts of operands. 
+*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/gemm/warp/default_mma_tensor_op.h" +#include "cutlass/gemm/warp/mma_tensor_op.h" + +#include "contrib_ops/cuda/moe/cutlass_extensions/arch/mma.h" +#include "contrib_ops/cuda/moe/cutlass_extensions/gemm/warp/mma_tensorop_compute_B_with_f16.h" + +namespace cutlass { +namespace gemm { +namespace warp { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Partial specialization for m-by-n-by-kgroup +template < + /// Shape of one matrix product operation (concept: GemmShape) + typename WarpShape_, + /// Shape of one matrix product operation (concept: GemmShape) + typename InstructionShape_, + /// Data type of A elements + typename ElementA, + /// Layout of A matrix (concept: MatrixLayout) + typename LayoutA, + /// Data type of B elements + typename ElementB, + /// Layout of B matrix (concept: MatrixLayout) + typename LayoutB, + /// Element type of C matrix + typename ElementC, + /// Layout of C matrix (concept: MatrixLayout) + typename LayoutC, + /// Number of partitions along K dimension + int PartitionsK, + /// Store the accumulators in row major or column major. Row major is used + /// when output layout is interleaved. + bool AccumulatorsInRowMajor> +struct DefaultMmaTensorOp { + private: + // Shape for computing the FP16s + using ComputeInstructionShape = InstructionShape_; + + // Chosen so we get K=16 for int8 and K=32 for int4. + static constexpr int LoadInstructionK = 8 * sizeof_bits::value / sizeof_bits::value; + + // Shape for loading the narrow data type from shared memory + using LoadInstructionShape = GemmShape; + + public: + using Policy = cutlass::gemm::warp::MmaTensorOpPolicy< + cutlass::arch::Mma, + cutlass::MatrixShape<1, 1>>; + + // Define the warp-level tensor op + using Type = cutlass::gemm::warp::MmaTensorOpComputeBWithF16; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace warp +} // namespace gemm +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/onnxruntime/contrib_ops/cuda/moe/cutlass_extensions/gemm/warp/mma_tensorop_compute_B_with_f16.h b/onnxruntime/contrib_ops/cuda/moe/cutlass_extensions/gemm/warp/mma_tensorop_compute_B_with_f16.h new file mode 100644 index 000000000000..a368c6d22026 --- /dev/null +++ b/onnxruntime/contrib_ops/cuda/moe/cutlass_extensions/gemm/warp/mma_tensorop_compute_B_with_f16.h @@ -0,0 +1,283 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission.
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Templates implementing warp-level matrix multiply-accumulate operations targeting + Tensor Cores. +*/ + +#pragma once + +#include "cutlass/array.h" +#include "cutlass/cutlass.h" +#include "cutlass/platform/platform.h" + +#include "cutlass/matrix_shape.h" +#include "cutlass/numeric_conversion.h" +#include "cutlass/numeric_types.h" + +#include "cutlass/arch/memory_sm75.h" +#include "cutlass/arch/mma_sm75.h" +#include "cutlass/arch/mma_sm80.h" + +#include "cutlass/gemm/gemm.h" +#include "cutlass/gemm/warp/mma.h" + +#include "cutlass/gemm/warp/mma_tensor_op_policy.h" + +#include "cutlass/gemm/warp/mma_tensor_op_tile_iterator.h" +#include "cutlass/gemm/warp/mma_tensor_op_tile_iterator_sm80.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace gemm { +namespace warp { + +///////////////////////////////////////////////////////////////////////////////////////////////// +/// Structure to compute the warp-level matrix product targeting Tensor Cores, converting B to the compute type of A before issuing the MMA. +template < + /// Size of the Gemm problem - concept: gemm::GemmShape<> + typename Shape_, + /// Data type of A elements + typename ElementA_, + /// Layout of A matrix (concept: MatrixLayout) + typename LayoutA_, + /// Data type of B elements + typename ElementB_, + /// Layout of B matrix (concept: MatrixLayout) + typename LayoutB_, + /// Element type of C matrix + typename ElementC_, + /// Layout of C matrix (concept: MatrixLayout) + typename LayoutC_, + /// Policy describing warp-level MmaTensorOp (concept: MmaTensorOp policy) + typename Policy_, + /// Instruction shape to override shared memory iterators with + typename SharedMemoryInstructionShape_, + /// Number of partitions along K dimension + int PartitionsK_ = 1, + /// Store the accumulators in row major or column major. Row major is used + /// when output layout is interleaved.
+ bool AccumulatorsInRowMajor = false, + /// Used for partial specialization + typename Enable = bool> +class MmaTensorOpComputeBWithF16 { + public: + /// Shape of warp-level matrix operation (concept: GemmShape) + using Shape = Shape_; + + /// Data type of multiplicand A + using ElementA = ElementA_; + + /// Layout of multiplicand A + using LayoutA = LayoutA_; + + /// Data type of multiplicand B + using ElementB = ElementB_; + + /// Layout of multiplicand B + using LayoutB = LayoutB_; + + /// Data type of accumulator matrix C + using ElementC = ElementC_; + + /// Layout of accumulator matrix C + using LayoutC = LayoutC_; + + /// Shape of the warp in units of thread (concept: MmaLanePolicySimt) + using Policy = Policy_; + + /// Underlying matrix multiply operator (concept: arch::Mma) + using ArchMmaOperator = typename Policy::Operator; + + /// Indicates math operator + using MathOperator = typename ArchMmaOperator::Operator; + + /// Architecture tag from underlying instruction + using ArchTag = typename ArchMmaOperator::ArchTag; + static_assert((platform::is_same::value && + platform::is_same::value) || + (platform::is_same::value && + platform::is_same::value && + ArchTag::kMinComputeCapability >= 80), + "MmaTensorOpComputeBWithF16 only supports underlying HMMA"); + + static_assert(platform::is_same::value || + (platform::is_same::value && ArchTag::kMinComputeCapability >= 80), + "MmaTensorOpComputeBWithF16 only supports FP16 A, or BF16 A on Ampere+"); + + /// Indicates class of matrix operator + using OperatorClass = arch::OpClassTensorOp; + + /// Shape of underlying instruction + using InstructionShape = typename ArchMmaOperator::Shape; + + /// Instruction shape to override shared memory iterators with + using SharedMemoryInstructionShape = SharedMemoryInstructionShape_; + + static_assert(SharedMemoryInstructionShape::kM == InstructionShape::kM, + "M dimension of compute instruction must match load"); + static_assert(SharedMemoryInstructionShape::kN == InstructionShape::kN, + "N dimension of compute instruction must match load"); + + static constexpr int kExpansionFactor = SharedMemoryInstructionShape::kK / InstructionShape::kK; + + static_assert(!(Shape::kK % SharedMemoryInstructionShape::kK), "Shape::kK must be a multiple of the shared memory load instruction K"); + + /// Complex transform on A operand + static ComplexTransform const kTransformA = ComplexTransform::kNone; + + /// Complex transform on B operand + static ComplexTransform const kTransformB = ComplexTransform::kNone; + + /// Number of threads participating in warp-level matrix product + static int const kThreadCount = 32; + + /// Number of partitions along K dimension + static int const kPartitionsK = PartitionsK_; + + public: + /// Iterates over the A operand in memory + using IteratorA = + MmaTensorOpMultiplicandTileIterator, Operand::kA, ElementA, LayoutA, + MatrixShape, + Policy::OpDelta::kRow, kThreadCount, kPartitionsK>; + + /// Storage for A tile + using FragmentA = typename IteratorA::Fragment; + + /// Storage for transformed A tile + using TransformedFragmentA = Array; + + /// Iterates over the B operand in memory + using IteratorB = + MmaTensorOpMultiplicandTileIterator, Operand::kB, ElementB, LayoutB, + MatrixShape, + Policy::OpDelta::kRow, kThreadCount, kPartitionsK>; + + /// Storage for B tile + using FragmentB = typename IteratorB::Fragment; + + /// Storage for transformed B tile + using TransformedFragmentB = Array; + + /// Iterates over the C operand in memory + using IteratorC = MmaTensorOpAccumulatorTileIterator, ElementC, LayoutC, + typename ArchMmaOperator::Shape, typename
Policy::OpDelta>; + + /// Storage for C tile + using FragmentC = typename IteratorC::Fragment; + + /// Number of mma operations performed + using MmaIterations = MatrixShape<(Shape::kM + ArchMmaOperator::Shape::kM - 1) / ArchMmaOperator::Shape::kM, + (Shape::kN + ArchMmaOperator::Shape::kN - 1) / ArchMmaOperator::Shape::kN>; + + public: + /// Underlying matrix multiply operator (concept: arch::Mma) + ArchMmaOperator mma; + + public: + // + // Methods + // + + /// Ctor + CUTLASS_DEVICE + MmaTensorOpComputeBWithF16() {} + + /// Performs a warp-level matrix multiply-accumulate operation + CUTLASS_DEVICE + void operator()(FragmentC& D, TransformedFragmentA const& A, TransformedFragmentB const& B, FragmentC const& C, + int const warp_tileB_k_offset) const { + using MmaOperandA = typename ArchMmaOperator::FragmentA; + using MmaOperandB = typename ArchMmaOperator::FragmentB; + using MmaOperandC = typename ArchMmaOperator::FragmentC; + + static_assert( + TransformedFragmentB::kElements == MmaOperandB::kElements * kExpansionFactor * MmaIterations::kColumn, + "Each thread should have a pack of mma registers for each column iteration AND for the expanded K dim of " + "B"); + + D = C; + + MmaOperandA const* ptr_A = reinterpret_cast(&A); + MmaOperandB const* ptr_B = reinterpret_cast(&B); + MmaOperandC* ptr_D = reinterpret_cast(&D); + +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 800) + // Serpentine visitation order maximizing reuse of Rb + CUTLASS_PRAGMA_UNROLL + for (int n = 0; n < MmaIterations::kColumn; ++n) { + CUTLASS_PRAGMA_UNROLL + for (int m = 0; m < MmaIterations::kRow; ++m) { + int m_serpentine = ((n % 2) ? (MmaIterations::kRow - 1 - m) : m); + + int n_offsetB = warp_tileB_k_offset + kExpansionFactor * n; + if (AccumulatorsInRowMajor) { // matrix B is reordered + mma(ptr_D[n + m_serpentine * MmaIterations::kColumn], ptr_A[m_serpentine], ptr_B[n_offsetB], + ptr_D[n + m_serpentine * MmaIterations::kColumn]); + } else { + mma(ptr_D[m_serpentine + n * MmaIterations::kRow], ptr_A[m_serpentine], ptr_B[n_offsetB], + ptr_D[m_serpentine + n * MmaIterations::kRow]); + } + } + } +#elif defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800) + // Serpentine visitation order maximizing reuse of Ra + CUTLASS_PRAGMA_UNROLL + for (int m = 0; m < MmaIterations::kRow; ++m) { + CUTLASS_PRAGMA_UNROLL + for (int n = 0; n < MmaIterations::kColumn; ++n) { + int n_serpentine = ((m % 2) ? 
(MmaIterations::kColumn - 1 - n) : n); + + int n_serpentine_offsetB = warp_tileB_k_offset + kExpansionFactor * n_serpentine; + if (AccumulatorsInRowMajor) { // matrix B is reordered + mma(ptr_D[n_serpentine + m * MmaIterations::kColumn], ptr_A[m], ptr_B[n_serpentine_offsetB], + ptr_D[n_serpentine + m * MmaIterations::kColumn]); + } else { + mma(ptr_D[m + n_serpentine * MmaIterations::kRow], ptr_A[m], ptr_B[n_serpentine_offsetB], + ptr_D[m + n_serpentine * MmaIterations::kRow]); + } + } + } +#else + assert(0); +#endif + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace warp +} // namespace gemm +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/onnxruntime/contrib_ops/cuda/moe/cutlass_extensions/gemm/warp/mma_tensorop_dequantizer.h b/onnxruntime/contrib_ops/cuda/moe/cutlass_extensions/gemm/warp/mma_tensorop_dequantizer.h new file mode 100644 index 000000000000..51ca8282e42f --- /dev/null +++ b/onnxruntime/contrib_ops/cuda/moe/cutlass_extensions/gemm/warp/mma_tensorop_dequantizer.h @@ -0,0 +1,534 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Defines iterators used by warp-level matrix multiply operations targeting Tensor Cores. 
+*/ + +#pragma once + +#include + +#include "cutlass/cutlass.h" + +#include "cutlass/array.h" +#include "cutlass/matrix_shape.h" +#include "cutlass/numeric_types.h" +#include "cutlass/tensor_ref.h" + +#include "cutlass/arch/arch.h" +#include "cutlass/arch/memory_sm75.h" +#include "cutlass/gemm/gemm.h" + +#include "cutlass/layout/matrix.h" +#include "cutlass/layout/pitch_linear.h" +#include "cutlass/layout/tensor.h" + +#include "cutlass/functional.h" +#include "cutlass/platform/platform.h" + +#include "contrib_ops/cuda/moe/cutlass_extensions/weight_only_quant_op.h" + +//////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace gemm { +namespace warp { + +//////////////////////////////////////////////////////////////////////////////// + +template < + /// Matrix multiply operator + typename MmaOperator_, + /// Size of the matrix to load (concept: MatrixShape) + typename Shape_, + /// Operand identity + Operand Operand, + /// Data type of Scale elements + typename Element_, + /// Layout of operand + typename Layout_, + /// Number of threads participating in one matrix operation + int Threads, + /// + WeightOnlyQuantOp QuantOp_, + /// + typename Enable = void> +class MmaTensorOpDequantizer; + +//////////////////////////////////////////////////////////////////////////////// +// Bfloat specialization for Ampere +template < + /// Underlying matrix multiply operator (concept: MmaTensorOp) + typename MmaOperator_, + /// Shape of the warp level matrix multiply (concept: GemmShape) + typename Shape_, + /// + WeightOnlyQuantOp QuantOp_> +class MmaTensorOpDequantizer< + MmaOperator_, Shape_, Operand::kB, bfloat16_t, layout::RowMajor, 32, QuantOp_, + typename platform::enable_if< + MmaOperator_::ArchTag::kMinComputeCapability >= 80 && + platform::is_same::value>::type> { + public: + /// Mma Operator + using MmaOperator = MmaOperator_; + + // The architecture specific mma operator being used + using ArchMmaOperator = typename MmaOperator::ArchMmaOperator; + + // Mma Instruction Shape + using InstructionShape = typename ArchMmaOperator::Shape; + + // This is the ratio of the load instruction vs the compute instruction.
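Concretely, assuming the common 16x8x16 fp16 tensor-op compute instruction together with the widened load shapes chosen in default_mma_tensor_op.h (K=16 for int8 B, K=32 for int4 B), the ratio works out as follows (host-side sketch; the shapes are assumptions for illustration):

    #include <cstdio>

    int main() {
      constexpr int compute_k = 16;            // InstructionShape::kK of a 16x8x16 HMMA (assumed)
      constexpr int load_k_int8 = 8 * 16 / 8;  // LoadInstructionK for 8-bit B -> 16
      constexpr int load_k_int4 = 8 * 16 / 4;  // LoadInstructionK for 4-bit B -> 32
      std::printf("int8 expansion factor: %d\n", load_k_int8 / compute_k);  // 1
      std::printf("int4 expansion factor: %d\n", load_k_int4 / compute_k);  // 2
      return 0;
    }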
+ static constexpr int kExpansionFactor = MmaOperator::IteratorB::InstructionShape::kRow / InstructionShape::kK; + + /// Type of the scales + using ElementScale = bfloat16_t; + + /// Fragment to hold B data before Mma + using FragmentDequantizedOperand = Array; + + // Fragment to hold scale data to apply to B before mma + // We need 1 scale element per matrix iteration in the N dimension + static constexpr int kColsPerMmaPerThread = 1; + using FragmentScale = Array; + using FragmentZero = Array; + + /// Warp mma shape + using Shape = Shape_; + + /// Layout of the scales in shared memory + using Layout = layout::RowMajor; + + /// TensorRef type for loading element from a tensor + using TensorRef = TensorRef; + + static constexpr WeightOnlyQuantOp QuantOp = QuantOp_; + + CUTLASS_DEVICE + MmaTensorOpDequantizer(TensorRef smem_scales, TensorRef smem_zeros, int const warp_idx_n, int const lane_idx) { + int const warp_offset = warp_idx_n * Shape::kN; + int const quad = lane_idx / 4; + int const thread_offset = warp_offset + quad; + pointer_scale_ = smem_scales.data() + thread_offset; + if constexpr (hasZero(QuantOp)) { + pointer_zero_ = smem_zeros.data() + thread_offset; + } + } + + CUTLASS_DEVICE + MmaTensorOpDequantizer(TensorRef smem_scales, int const warp_idx_n, int const lane_idx) + : MmaTensorOpDequantizer(smem_scales, TensorRef(), warp_idx_n, lane_idx) {} + + CUTLASS_DEVICE + void load(FragmentScale& scale_frag) { + CUTLASS_PRAGMA_UNROLL + for (int mma_n_iter = 0; mma_n_iter < MmaOperator::MmaIterations::kColumn; ++mma_n_iter) { + scale_frag[mma_n_iter] = pointer_scale_[mma_n_iter * InstructionShape::kN]; + } + } + + CUTLASS_DEVICE + void dequantize(FragmentDequantizedOperand& operand_frag, FragmentScale const& scale_frag) { + // Slow path not implemented here on purpose. If we need to do HMMA on older arch, scale conversion should + // happen before scales are stored to shared memory and we should use the fp16 dequantizer. This will avoid + // numerous conversion instructions in the GEMM main loop. + arch::device_breakpoint(); + } + + CUTLASS_DEVICE + void load(FragmentScale& scale_frag, FragmentScale& zero_frag) { + if constexpr (hasZero(QuantOp)) { + CUTLASS_PRAGMA_UNROLL + for (int mma_n_iter = 0; mma_n_iter < MmaOperator::MmaIterations::kColumn; ++mma_n_iter) { + scale_frag[mma_n_iter] = pointer_scale_[mma_n_iter * InstructionShape::kN]; + zero_frag[mma_n_iter] = pointer_zero_[mma_n_iter * InstructionShape::kN]; + } + } else { + CUTLASS_PRAGMA_UNROLL + for (int mma_n_iter = 0; mma_n_iter < MmaOperator::MmaIterations::kColumn; ++mma_n_iter) { + scale_frag[mma_n_iter] = pointer_scale_[mma_n_iter * InstructionShape::kN]; + } + } + } + + CUTLASS_DEVICE + void dequantize(FragmentDequantizedOperand& operand_frag, FragmentScale const& scale_frag, + FragmentScale const& zero_frag) { + // Slow path not implemented here on purpose. If we need to do HMMA on older arch, scale conversion should + // happen before scales are stored to shared memory and we should use the fp16 dequantizer. This will avoid + // numerous conversion instructions in the GEMM main loop. + arch::device_breakpoint(); + } + + // Adds a pointer offset in units of elements.
+ CUTLASS_DEVICE + void add_pointer_offset(int64_t const& offset) { + static_assert(sizeof(ElementScale) > 1, ""); + pointer_scale_ += offset; + pointer_zero_ += offset; + } + + private: + ElementScale const* pointer_scale_; + ElementScale const* pointer_zero_; +}; + +//////////////////////////////////////////////////////////////////////////////// + +// Specialization for Turing & Ampere +template < + /// Underlying matrix multiply operator (concept: MmaTensorOp) + typename MmaOperator_, + /// Shape of the warp level matrix multiply (concept: GemmShape) + typename Shape_, + /// + WeightOnlyQuantOp QuantOp_> +class MmaTensorOpDequantizer< + MmaOperator_, Shape_, Operand::kB, half_t, layout::RowMajor, 32, QuantOp_, + typename platform::enable_if< + MmaOperator_::ArchTag::kMinComputeCapability >= 75 && + platform::is_same::value>::type> { + public: + /// Mma Operator + using MmaOperator = MmaOperator_; + + // The architecture specific mma operator being used + using ArchMmaOperator = typename MmaOperator::ArchMmaOperator; + + // Mma Instruction Shape + using InstructionShape = typename ArchMmaOperator::Shape; + + // This is the ratio of the load instruction vs the compute instruction. + static constexpr int kExpansionFactor = MmaOperator::IteratorB::InstructionShape::kRow / InstructionShape::kK; + + /// Type of the scales + using ElementScale = half_t; + + /// Fragment to hold B data before Mma + using FragmentDequantizedOperand = Array; + + // Fragment to hold scale data to apply to B before mma + // We need 1 fp16 per matrix iteration in the N dimension + static constexpr int kColsPerMmaPerThread = 1; + using FragmentScale = Array; + using FragmentZero = Array; + + /// Warp mma shape + using Shape = Shape_; + + /// Layout of the scales in shared memory + using Layout = layout::RowMajor; + + /// TensorRef type for loading element from a tensor + using TensorRef = TensorRef; + + static constexpr WeightOnlyQuantOp QuantOp = QuantOp_; + + CUTLASS_DEVICE + MmaTensorOpDequantizer(TensorRef smem_scales, TensorRef smem_zeros, int const warp_idx_n, int const lane_idx) { + int const warp_offset = warp_idx_n * Shape::kN; + int const quad = lane_idx / 4; + int const thread_offset = warp_offset + quad; + pointer_scale_ = smem_scales.data() + thread_offset; + if constexpr (hasZero(QuantOp)) { + pointer_zero_ = smem_zeros.data() + thread_offset; + } + } + + CUTLASS_DEVICE + MmaTensorOpDequantizer(TensorRef smem_scales, int const warp_idx_n, int const lane_idx) + : MmaTensorOpDequantizer(smem_scales, TensorRef(), warp_idx_n, lane_idx) {} + + CUTLASS_DEVICE + void load(FragmentScale& scale_frag) { + CUTLASS_PRAGMA_UNROLL + for (int mma_n_iter = 0; mma_n_iter < MmaOperator::MmaIterations::kColumn; ++mma_n_iter) { + scale_frag[mma_n_iter] = pointer_scale_[mma_n_iter * InstructionShape::kN]; + } + } + + CUTLASS_DEVICE + void dequantize(FragmentDequantizedOperand& operand_frag, FragmentScale const& scale_frag) { + using _MmaOperandB = typename ArchMmaOperator::FragmentB; + using ExpandedMmaOperandB = Array; + static_assert( + ExpandedMmaOperandB::kElements * MmaOperator::MmaIterations::kColumn == FragmentDequantizedOperand::kElements, + ""); + + multiplies mul_op; + + ExpandedMmaOperandB* operand_frag_ptr = reinterpret_cast(&operand_frag); + CUTLASS_PRAGMA_UNROLL + for (int mma_n_iter = 0; mma_n_iter < MmaOperator::MmaIterations::kColumn; ++mma_n_iter) { + operand_frag_ptr[mma_n_iter] = mul_op(operand_frag_ptr[mma_n_iter], scale_frag[mma_n_iter]); + } + } + + CUTLASS_DEVICE + void load(FragmentScale&
scale_frag, FragmentScale& zero_frag) { + if constexpr (hasZero(QuantOp)) { + CUTLASS_PRAGMA_UNROLL + for (int mma_n_iter = 0; mma_n_iter < MmaOperator::MmaIterations::kColumn; ++mma_n_iter) { + scale_frag[mma_n_iter] = pointer_scale_[mma_n_iter * InstructionShape::kN]; + zero_frag[mma_n_iter] = pointer_zero_[mma_n_iter * InstructionShape::kN]; + } + } else { + CUTLASS_PRAGMA_UNROLL + for (int mma_n_iter = 0; mma_n_iter < MmaOperator::MmaIterations::kColumn; ++mma_n_iter) { + scale_frag[mma_n_iter] = pointer_scale_[mma_n_iter * InstructionShape::kN]; + } + } + } + + CUTLASS_DEVICE + void dequantize(FragmentDequantizedOperand& operand_frag, FragmentScale const& scale_frag, + FragmentScale const& zero_frag) { + using _MmaOperandB = typename ArchMmaOperator::FragmentB; + using ExpandedMmaOperandB = Array; + static_assert( + ExpandedMmaOperandB::kElements * MmaOperator::MmaIterations::kColumn == FragmentDequantizedOperand::kElements, + ""); + + multiplies mul_op; + ExpandedMmaOperandB* operand_frag_ptr = reinterpret_cast(&operand_frag); + + if constexpr (hasZero(QuantOp)) { + plus plus_op; + + CUTLASS_PRAGMA_UNROLL + for (int mma_n_iter = 0; mma_n_iter < MmaOperator::MmaIterations::kColumn; ++mma_n_iter) { + operand_frag_ptr[mma_n_iter] = + plus_op(mul_op(operand_frag_ptr[mma_n_iter], scale_frag[mma_n_iter]), zero_frag[mma_n_iter]); + } + } else { + CUTLASS_PRAGMA_UNROLL + for (int mma_n_iter = 0; mma_n_iter < MmaOperator::MmaIterations::kColumn; ++mma_n_iter) { + operand_frag_ptr[mma_n_iter] = mul_op(operand_frag_ptr[mma_n_iter], scale_frag[mma_n_iter]); + } + } + } + + // Adds a pointer offset in units of elements. + CUTLASS_DEVICE + void add_pointer_offset(int64_t const& offset) { + static_assert(sizeof(ElementScale) > 1, ""); + pointer_scale_ += offset; + pointer_zero_ += offset; + } + + private: + ElementScale const* pointer_scale_; + ElementScale const* pointer_zero_; +}; + +//////////////////////////////////////////////////////////////////////////////// + +// Specialization for Volta A x RowMajor B tensorOp, for 32x32x4 interleaved gemm +template < + /// Underlying matrix multiply operator (concept: MmaTensorOp) + typename MmaOperator_, + /// Shape of the warp level matrix multiply (concept: GemmShape) + typename Shape_, + /// + WeightOnlyQuantOp QuantOp_> +class MmaTensorOpDequantizer< + MmaOperator_, Shape_, Operand::kB, half_t, layout::RowMajor, 32, QuantOp_, + typename platform::enable_if< + platform::is_same::value && + platform::is_same::value>::type> { + public: + static_assert(platform::is_same>::value, ""); + + /// Mma Operator + using MmaOperator = MmaOperator_; + + // The architecture specific mma operator being used + using ArchMmaOperator = typename MmaOperator::ArchMmaOperator; + + // Mma Instruction Shape + using InstructionShape = typename ArchMmaOperator::Shape; + + /// Type of the scales + using ElementScale = half_t; + + /// Fragment to hold B data before Mma + using FragmentDequantizedOperand = Array; + + /// Warp mma shape + using Shape = Shape_; + + // Fragment to hold scale data to apply to B before mma + // Each 32x32x4 matmul uses 8 elements from B.
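Per element, the dequantize() overloads above reduce to a multiply by the column's scale, plus a zero-point shift when the quantization op carries zeros. A scalar model (host-side sketch; float stands in for half_t and the numeric values are made up):

    #include <cstdio>

    int main() {
      float w = 5.0f;                     // B element already converted to the compute type
      float scale = 0.02f, zero = -0.1f;  // per-column quantization parameters (illustrative)
      float scale_only = w * scale;             // path taken when !hasZero(QuantOp)
      float scale_and_zero = w * scale + zero;  // path taken when hasZero(QuantOp)
      std::printf("%f %f\n", scale_only, scale_and_zero);
      return 0;
    }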
+ static constexpr int ColsPerMmaTile = 32; + static constexpr int TileNIterations = Shape::kN / ColsPerMmaTile; + using FragmentScale = Array; + using AccessType = Array; + + /// Layout of the scales in shared memory + using Layout = layout::RowMajor; + + /// TensorRef type for loading element from a tensor + using TensorRef = TensorRef; + + static constexpr WeightOnlyQuantOp QuantOp = QuantOp_; + static_assert(QuantOp == WeightOnlyQuantOp::PER_COLUMN_SCALE_ONLY, ""); + + CUTLASS_DEVICE + MmaTensorOpDequantizer(TensorRef smem_scales, int const warp_idx_n, int const lane_idx) { + int const warp_offset = warp_idx_n * Shape::kN; + int const base_col = lane_idx & 0xF8; + int const thread_offset = warp_offset + base_col; + pointer_ = smem_scales.data() + thread_offset; + } + + CUTLASS_DEVICE + void load(FragmentScale& scale_frag) { + AccessType* scale_frag_ptr = reinterpret_cast(&scale_frag); + + CUTLASS_PRAGMA_UNROLL + for (int tile_iter = 0; tile_iter < TileNIterations; ++tile_iter) { + // We jump by 32 here since volta does <32x32x4> super mmas inside a warp. + scale_frag_ptr[tile_iter] = *reinterpret_cast(pointer_ + ColsPerMmaTile * tile_iter); + } + } + + CUTLASS_DEVICE + void dequantize(FragmentDequantizedOperand& operand_frag, FragmentScale const& scale_frag) { + static_assert(FragmentScale::kElements == FragmentDequantizedOperand::kElements, ""); + + multiplies mul_op; + operand_frag = mul_op(operand_frag, scale_frag); + } + + private: + ElementScale const* pointer_; +}; + +//////////////////////////////////////////////////////////////////////////////// + +// Specialization for Volta A x ColumnMajor B tensorOp, for 32x32x4 interleaved gemm +template < + /// Underlying matrix multiply operator (concept: MmaTensorOp) + typename MmaOperator_, + /// Shape of the warp level matrix multiply (concept: GemmShape) + typename Shape_, + /// + WeightOnlyQuantOp QuantOp_> +class MmaTensorOpDequantizer< + MmaOperator_, Shape_, Operand::kB, half_t, layout::RowMajor, 32, QuantOp_, + typename platform::enable_if< + platform::is_same::value && + platform::is_same::value>::type> { + public: + static_assert(platform::is_same>::value, ""); + + /// Mma Operator + using MmaOperator = MmaOperator_; + + // The architecture specific mma operator being used + using ArchMmaOperator = typename MmaOperator::ArchMmaOperator; + + // Mma Instruction Shape + using InstructionShape = typename ArchMmaOperator::Shape; + + /// Type of the scales + using ElementScale = half_t; + + /// Fragment to hold B data before Mma + using FragmentDequantizedOperand = Array; + + /// Warp mma shape + using Shape = Shape_; + + // Fragment to hold scale data to apply to B before mma + // Each 32x32x4 matmul uses 8 elements from B.
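Both Volta specializations derive a lane's scale column directly from its lane index. A host-side sketch of the two mappings, covering the row-major constructor above and the column-major one that follows:

    #include <cstdio>

    int main() {
      for (int lane = 0; lane < 32; ++lane) {
        int row_major_col = lane & 0xF8;               // lanes align to 8-column groups: 0, 8, 16, 24
        int col_major_col = (lane & 0xF8) + lane % 4;  // column-major adds a per-quad offset
        std::printf("lane %2d -> row-major col %2d, col-major col %2d\n", lane, row_major_col, col_major_col);
      }
      return 0;
    }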
+ static constexpr int ColsPerMmaTile = 32; + static constexpr int TileNIterations = Shape::kN / ColsPerMmaTile; + using FragmentScale = Array; + + /// Layout of the scales in shared memory + using Layout = layout::RowMajor; + + /// TensorRef type for loading element from a tensor + using TensorRef = TensorRef; + + static constexpr WeightOnlyQuantOp QuantOp = QuantOp_; + static_assert(QuantOp == WeightOnlyQuantOp::PER_COLUMN_SCALE_ONLY, ""); + + CUTLASS_DEVICE + MmaTensorOpDequantizer(TensorRef smem_scales, int const warp_idx_n, int const lane_idx) { + int const warp_offset = warp_idx_n * Shape::kN; + int const base_col = (lane_idx & 0xF8) + lane_idx % 4; + int const thread_offset = warp_offset + base_col; + pointer_ = smem_scales.data() + thread_offset; + } + + CUTLASS_DEVICE + void load(FragmentScale& scale_frag) { + CUTLASS_PRAGMA_UNROLL + for (int tile_iter = 0; tile_iter < TileNIterations; ++tile_iter) { + // We jump by 32 here since volta does <32x32x4> super mmas inside a warp. + // For col major B, each thread will jump 4 cols to get its next value inside + // of the super mma. + CUTLASS_PRAGMA_UNROLL + for (int mma_iter = 0; mma_iter < 2; ++mma_iter) { + scale_frag[tile_iter * 2 + mma_iter] = pointer_[ColsPerMmaTile * tile_iter + 4 * mma_iter]; + } + } + } + + CUTLASS_DEVICE + void dequantize(FragmentDequantizedOperand& operand_frag, FragmentScale const& scale_frag) { + using MmaOperandB = typename ArchMmaOperator::FragmentB; + static constexpr int total_n_mmas = 2 * TileNIterations; + static_assert(MmaOperandB::kElements * total_n_mmas == FragmentDequantizedOperand::kElements, ""); + + multiplies mul_op; + + MmaOperandB* operand_frag_ptr = reinterpret_cast(&operand_frag); + CUTLASS_PRAGMA_UNROLL + for (int mma_n_iter = 0; mma_n_iter < total_n_mmas; ++mma_n_iter) { + operand_frag_ptr[mma_n_iter] = mul_op(operand_frag_ptr[mma_n_iter], scale_frag[mma_n_iter]); + } + } + + private: + ElementScale const* pointer_; +}; + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace warp +} // namespace gemm +} // namespace cutlass + +//////////////////////////////////////////////////////////////////////////////// diff --git a/onnxruntime/contrib_ops/cuda/moe/cutlass_extensions/gemm_configs.h b/onnxruntime/contrib_ops/cuda/moe/cutlass_extensions/gemm_configs.h new file mode 100644 index 000000000000..0841218a480b --- /dev/null +++ b/onnxruntime/contrib_ops/cuda/moe/cutlass_extensions/gemm_configs.h @@ -0,0 +1,125 @@ +/* + * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +namespace ort_fastertransformer { +// Note: The shapes are in the format MxNxK. The K shape of the runtime config MUST match the K shape +// in the kernel layout details when doing weight only quantization.
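As an example of the naming, CtaShape128x128x64_WarpShape64x32x64 below denotes a 128x128x64 threadblock tile (M=128, N=128, K=64) computed by warps that each own a 64x32x64 tile. A config for it could be built against the CutlassGemmConfig constructor defined later in this file (a sketch; the split-k factor and stage count are illustrative values):

    #include "contrib_ops/cuda/moe/cutlass_extensions/gemm_configs.h"

    int main() {
      using namespace ort_fastertransformer;
      // 128x128x64 CTA tile, 64x32x64 warp tiles, no split-k, 3 pipeline stages (illustrative).
      CutlassGemmConfig config(CutlassTileConfig::CtaShape128x128x64_WarpShape64x32x64,
                               SplitKStyle::NO_SPLIT_K,
                               /*split_k_factor=*/1,
                               /*stages=*/3);
      (void)config;
      return 0;
    }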
+enum class CutlassTileConfig { + // Signals that we should run heuristics to choose a config + Undefined, + + // Signals that we should run heuristics to choose a config + ChooseWithHeuristic, + + // SIMT config + CtaShape128x128x8_WarpShape64x64x8, + + // TensorCore configs CTA_N = 128, CTA_K = 64 + // Warp configs for M=16 + CtaShape16x128x64_WarpShape16x32x64, + // Warp configs for M=32 + CtaShape32x128x64_WarpShape32x32x64, + + // Warp configs for M=64 + CtaShape64x128x64_WarpShape32x64x64, + CtaShape64x64x128_WarpShape32x64x64, + CtaShape64x128x64_WarpShape64x32x64, + + // Warp configs for M=128 + CtaShape128x64x64_WarpShape64x32x64, + CtaShape128x128x64_WarpShape64x32x64, + CtaShape128x128x64_WarpShape64x64x64, + CtaShape128x128x64_WarpShape128x32x64, + CtaShape128x256x64_WarpShape64x64x64, + + // Warp configs for M=256 + CtaShape256x128x64_WarpShape64x64x64, + + // TensorCore config CTA_N = 256, CTA_K = 64 + CtaShape16x256x64_WarpShape16x64x64 +}; + +enum class SplitKStyle { + NO_SPLIT_K, + SPLIT_K_SERIAL, + // SPLIT_K_PARALLEL // Not supported yet +}; + +enum class CutlassTileConfigSM90 { + // Signals that we should run heuristics to choose a config + Undefined, + + // Signals that we should run heuristics to choose a config + ChooseWithHeuristic, + + // CTA configs for M=64 + CtaShape64x16x128B, + CtaShape64x32x128B, + CtaShape64x64x128B, + CtaShape64x128x128B, + CtaShape64x256x128B, + + // CTA configs for M=128 + CtaShape128x16x128B, + CtaShape128x32x128B, + CtaShape128x64x128B, + CtaShape128x128x128B, + CtaShape128x256x128B, +}; + +enum class MainloopScheduleType { + AUTO // Automatically selects between pingpong and cooperative schedules on Hopper. On older architectures, this + // defaults to the "legacy" main loop schedule. +}; + +enum class EpilogueScheduleType { + AUTO // Automatically chooses an epilogue schedule compatible with the selected main loop schedule for Hopper. For + // architectures older than Hopper, the epilogue is always performed by the same thread block as the main loop.
+}; + +enum class ClusterShape { ClusterShape_1x1x1, + ClusterShape_2x1x1, + ClusterShape_1x2x1, + ClusterShape_2x2x1 }; + +struct CutlassGemmConfig { + CutlassTileConfig tile_config = CutlassTileConfig::ChooseWithHeuristic; + SplitKStyle split_k_style = SplitKStyle::NO_SPLIT_K; + int split_k_factor = -1; + int stages = -1; + + // config options for sm90 + CutlassTileConfigSM90 tile_config_sm90 = CutlassTileConfigSM90::ChooseWithHeuristic; + MainloopScheduleType mainloop_schedule = MainloopScheduleType::AUTO; + EpilogueScheduleType epilogue_schedule = EpilogueScheduleType::AUTO; + ClusterShape cluster_shape = ClusterShape::ClusterShape_1x1x1; + + CutlassGemmConfig() {} + + CutlassGemmConfig(CutlassTileConfig tile_config, SplitKStyle split_k_style, int split_k_factor, int stages) + : tile_config(tile_config), split_k_style(split_k_style), split_k_factor(split_k_factor), stages(stages) {} + + CutlassGemmConfig(CutlassTileConfigSM90 tile_config_sm90, MainloopScheduleType mainloop_schedule, + EpilogueScheduleType epilogue_schedule, ClusterShape cluster_shape) + : tile_config_sm90(tile_config_sm90), + mainloop_schedule(mainloop_schedule), + epilogue_schedule(epilogue_schedule), + cluster_shape(cluster_shape) {} +}; + +} // namespace ort_fastertransformer diff --git a/onnxruntime/contrib_ops/cuda/moe/cutlass_extensions/interleaved_numeric_conversion.h b/onnxruntime/contrib_ops/cuda/moe/cutlass_extensions/interleaved_numeric_conversion.h new file mode 100644 index 000000000000..7fd1745aa2c5 --- /dev/null +++ b/onnxruntime/contrib_ops/cuda/moe/cutlass_extensions/interleaved_numeric_conversion.h @@ -0,0 +1,392 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! 
+ \file + \brief Boost-like numeric conversion operator for int8 and CUTLASS int4b_t interleaved in a register +*/ + +#pragma once + +#include "cutlass/arch/arch.h" +#include "cutlass/array.h" +#include "cutlass/half.h" +#include "cutlass/numeric_types.h" + +namespace cutlass { + +// This converter is meant to be used with data interleaved in a 32-bit register where the even elements are in the low +// bits and the odd elements are in the high bits of the register. In addition, it assumes elements were originally +// signed and had a bias of 2**(b-1) added (where b is the number of bits in the type) to make all numbers unsigned. +// This converter will uninterleave the data and subtract the bias while converting to the result type. +template +struct FastInterleavedAndBiasedNumericArrayConverter {}; + +template <> +struct FastInterleavedAndBiasedNumericArrayConverter { + using result_type = Array; + using source_type = Array; + + CUTLASS_DEVICE + static result_type convert(source_type const& source) { + result_type result; + + uint32_t* h = reinterpret_cast(&result); + uint32_t const i8s = reinterpret_cast(source); + + static constexpr uint32_t mask_for_elt_01 = 0x5250; + static constexpr uint32_t mask_for_elt_23 = 0x5351; + static constexpr uint32_t start_byte_for_fp16 = 0x64646464; + asm volatile("prmt.b32 %0,%1,%2,%3;\n" : "=r"(h[0]) : "r"(i8s), "n"(start_byte_for_fp16), "n"(mask_for_elt_01)); + asm volatile("prmt.b32 %0,%1,%2,%3;\n" : "=r"(h[1]) : "r"(i8s), "n"(start_byte_for_fp16), "n"(mask_for_elt_23)); + + // Lastly, we subtract 1152 from our constructed number using fp16 math to get our signed integer as fp16. + static constexpr uint32_t I8s_TO_F16s_MAGIC_NUM = 0x64806480; + asm volatile("sub.f16x2 %0, %1, %2;\n" : "=r"(h[0]) : "r"(h[0]), "r"(I8s_TO_F16s_MAGIC_NUM)); + asm volatile("sub.f16x2 %0, %1, %2;\n" : "=r"(h[1]) : "r"(h[1]), "r"(I8s_TO_F16s_MAGIC_NUM)); + + return result; + } + + CUTLASS_DEVICE + result_type operator()(source_type const& s) { return convert(s); } +}; + +template +struct FastInterleavedAndBiasedNumericArrayConverter { + static constexpr int VEC_WIDTH = 4; + static_assert(!(N % VEC_WIDTH), "N must be multiple of 4."); + + using result_type = Array; + using source_type = Array; + + CUTLASS_DEVICE + static result_type convert(source_type const& source) { + using scalar_result_type = typename result_type::Element; + using scalar_source_type = typename source_type::Element; + FastInterleavedAndBiasedNumericArrayConverter convert_vector_; + + result_type result; + using vec_result = Array; + using vec_source = Array; + + vec_result* result_ptr = reinterpret_cast(&result); + vec_source const* source_ptr = reinterpret_cast(&source); + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N / VEC_WIDTH; ++i) { + result_ptr[i] = convert_vector_(source_ptr[i]); + } + + return result; + } + + CUTLASS_DEVICE + result_type operator()(source_type const& s) { return convert(s); } +}; + +template <> +struct FastInterleavedAndBiasedNumericArrayConverter { + using result_type = Array; + using source_type = Array; + + CUTLASS_DEVICE + static result_type convert(source_type const& source) { + result_type result; +#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)) + + uint32_t* bf16_result_ptr = reinterpret_cast(&result); + uint32_t const i8s = reinterpret_cast(source); + + static constexpr uint32_t fp32_base = 0x4B000000; + float fp32_intermediates[4]; + + // Construct FP32s, bfloat does not have enough mantissa for IADD trick + uint32_t* fp32_intermediates_casted =
reinterpret_cast(fp32_intermediates); + fp32_intermediates_casted[0] = __byte_perm(i8s, fp32_base, 0x7650); + fp32_intermediates_casted[1] = __byte_perm(i8s, fp32_base, 0x7652); + fp32_intermediates_casted[2] = __byte_perm(i8s, fp32_base, 0x7651); + fp32_intermediates_casted[3] = __byte_perm(i8s, fp32_base, 0x7653); + + // Subtract out fp32_base + 128 to make the unsigned integer signed. + CUTLASS_PRAGMA_UNROLL + for (int ii = 0; ii < 4; ++ii) { + fp32_intermediates[ii] -= 8388736.f; + } + + // Truncate the fp32 representation and pack up as bfloat16s. + CUTLASS_PRAGMA_UNROLL + for (int ii = 0; ii < 2; ++ii) { + bf16_result_ptr[ii] = + __byte_perm(fp32_intermediates_casted[2 * ii + 0], fp32_intermediates_casted[2 * ii + 1], 0x7632); + } +#else + // Disable this on architectures older than Ampere since they lack hardware for bf16 mma. If one wishes to use + // HMMA on older hardware, they should convert directly to FP16 using FP16 converters. + result.clear(); // Suppress compiler warning + arch::device_breakpoint(); +#endif + return result; + } + + CUTLASS_DEVICE + result_type operator()(source_type const& s) { return convert(s); } +}; + +template +struct FastInterleavedAndBiasedNumericArrayConverter { + static constexpr int VEC_WIDTH = 4; + static_assert(!(N % VEC_WIDTH), "N must be multiple of 4."); + + using result_type = Array; + using source_type = Array; + + CUTLASS_DEVICE + static result_type convert(source_type const& source) { + using scalar_result_type = typename result_type::Element; + using scalar_source_type = typename source_type::Element; + FastInterleavedAndBiasedNumericArrayConverter convert_vector_; + + result_type result; + using vec_result = Array; + using vec_source = Array; + + vec_result* result_ptr = reinterpret_cast(&result); + vec_source const* source_ptr = reinterpret_cast(&source); + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N / VEC_WIDTH; ++i) { + result_ptr[i] = convert_vector_(source_ptr[i]); + } + + return result; + } + + CUTLASS_DEVICE + result_type operator()(source_type const& s) { return convert(s); } +}; + +template <> +struct FastInterleavedAndBiasedNumericArrayConverter { + using result_type = Array; + using source_type = Array; + + CUTLASS_DEVICE + static result_type convert(source_type const& source) { + result_type result; + + uint32_t* h = reinterpret_cast(&result); + uint32_t const i4s = reinterpret_cast(source); + + // First, we extract the i4s and construct an intermediate fp16 number. + static constexpr uint32_t immLut = (0xf0 & 0xcc) | 0xaa; + static constexpr uint32_t BOTTOM_MASK = 0x000f000f; + static constexpr uint32_t TOP_MASK = 0x00f000f0; + static constexpr uint32_t I4s_TO_F16s_MAGIC_NUM = 0x64006400; + + // Note that the entire sequence only requires 1 shift instruction. This is thanks to the register packing + // format and the fact that we force our integers to be unsigned, and account for this in the fp16 subtractions. + // In addition, I exploit the fact that sub and fma have the same throughput in order to convert elt_23 and + // elt_67 to fp16 without having to shift them to the bottom bits beforehand. + + // Shift right by 8 to now consider elt_45 and elt_67. Issue first to hide RAW dependency if we issue + // immediately before required.
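The magic constants in the sequence below can be sanity-checked with ordinary arithmetic: ORing a biased nibble x into the mantissa of half 0x6400 (1024.0) yields 1024 + x, so subtracting 1032 recovers the signed value x - 8; a high nibble sits four bits up, giving 1024 + 16x, which the fma with 1/16 and -72 also reduces to x - 8. A host-side sketch with doubles standing in for fp16:

    #include <cstdio>

    int main() {
      for (int x = 0; x < 16; ++x) {         // biased nibble encoding the signed value x - 8
        double low = (1024.0 + x) - 1032.0;  // sub.f16x2 with 0x6408 (= 1032)
        double high = (1024.0 + 16.0 * x) * (1.0 / 16.0) - 72.0;  // fma.rn.f16x2 with 1/16 and -72
        std::printf("nibble %2d -> low %5.0f, high %5.0f (expect %3d)\n", x, low, high, x - 8);
      }
      return 0;
    }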
+ const uint32_t top_i4s = i4s >> 8; + // Extract elt_01 - (i4s & 0x000f000f) | 0x64006400 + asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n" + : "=r"(h[0]) + : "r"(i4s), "n"(BOTTOM_MASK), "n"(I4s_TO_F16s_MAGIC_NUM), "n"(immLut)); + // Extract elt_23 (i4s & 0x00f000f0) | 0x64006400 + asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n" + : "=r"(h[1]) + : "r"(i4s), "n"(TOP_MASK), "n"(I4s_TO_F16s_MAGIC_NUM), "n"(immLut)); + // Extract elt_45 (top_i4s & 0x000f000f) | 0x64006400 + asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n" + : "=r"(h[2]) + : "r"(top_i4s), "n"(BOTTOM_MASK), "n"(I4s_TO_F16s_MAGIC_NUM), "n"(immLut)); + // Extract elt_67 (top_i4s & 0x00f000f0) | 0x64006400 + asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n" + : "=r"(h[3]) + : "r"(top_i4s), "n"(TOP_MASK), "n"(I4s_TO_F16s_MAGIC_NUM), "n"(immLut)); + + // I use inline PTX below because I am not sure if the compiler will emit float2half instructions if I use the + // half2 ctor. In this case, I chose performance reliability over code readability. + + // This is the half2 {1032, 1032} represented as an integer. + static constexpr uint32_t FP16_TOP_MAGIC_NUM = 0x64086408; + // This is the half2 {1 / 16, 1 / 16} represented as an integer. + static constexpr uint32_t ONE_SIXTEENTH = 0x2c002c00; + // This is the half2 {-72, -72} represented as an integer. + static constexpr uint32_t NEG_72 = 0xd480d480; + + // Finally, we construct the output numbers. + // Convert elt_01 + asm volatile("sub.f16x2 %0, %1, %2;\n" : "=r"(h[0]) : "r"(h[0]), "r"(FP16_TOP_MAGIC_NUM)); + // Convert elt_23 + asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(h[1]) : "r"(h[1]), "r"(ONE_SIXTEENTH), "r"(NEG_72)); + // Convert elt_45 + asm volatile("sub.f16x2 %0, %1, %2;\n" : "=r"(h[2]) : "r"(h[2]), "r"(FP16_TOP_MAGIC_NUM)); + // Convert elt_67 + asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(h[3]) : "r"(h[3]), "r"(ONE_SIXTEENTH), "r"(NEG_72)); + + return result; + } + + CUTLASS_DEVICE + result_type operator()(source_type const& s) { return convert(s); } +}; + +template +struct FastInterleavedAndBiasedNumericArrayConverter { + static constexpr int VEC_WIDTH = 8; + static_assert(!(N % VEC_WIDTH), "N must be multiple of 8."); + + using result_type = Array; + using source_type = Array; + + CUTLASS_DEVICE + static result_type convert(source_type const& source) { + using scalar_result_type = typename result_type::Element; + using scalar_source_type = typename source_type::Element; + FastInterleavedAndBiasedNumericArrayConverter convert_vector_; + + result_type result; + using vec_result = Array; + using vec_source = Array; + + vec_result* result_ptr = reinterpret_cast(&result); + vec_source const* source_ptr = reinterpret_cast(&source); + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N / VEC_WIDTH; ++i) { + result_ptr[i] = convert_vector_(source_ptr[i]); + } + + return result; + } + + CUTLASS_DEVICE + result_type operator()(source_type const& s) { return convert(s); } +}; + +template <> +struct FastInterleavedAndBiasedNumericArrayConverter { + using result_type = Array; + using source_type = Array; + + CUTLASS_DEVICE + static result_type convert(source_type const& source) { + result_type result; +#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)) + + uint32_t* h = reinterpret_cast(&result); + uint32_t const source_i4s = reinterpret_cast(source); + + // First, we extract the i4s and construct an intermediate bf16 number.
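The bf16 variant below checks out the same way: bf16 0x4300 is 128.0, ORing a nibble x into its seven-bit mantissa gives 128 + x, and the fma with 1.0 and -136 recovers x - 8. A host-side sketch with doubles standing in for bf16:

    #include <cstdio>

    int main() {
      for (int x = 0; x < 16; ++x) {
        double v = (128.0 + x) * 1.0 + (-136.0);  // fma.rn.bf16x2 with BF16_ONE and BF16_BIAS
        std::printf("nibble %2d -> %4.0f (expect %d)\n", x, v, x - 8);
      }
      return 0;
    }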
+    static constexpr uint32_t immLut = (0xf0 & 0xcc) | 0xaa;
+    static constexpr uint32_t MASK = 0x000f000f;
+    static constexpr uint32_t I4s_TO_BF16s_MAGIC_NUM = 0x43004300;
+
+    // We don't have enough mantissa to remove as much shift overhead as FP16, so we must loop.
+    // No shift needed for first item.
+    uint32_t i4s = source_i4s;
+    asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n"
+                 : "=r"(h[0])
+                 : "r"(i4s), "n"(MASK), "n"(I4s_TO_BF16s_MAGIC_NUM), "n"(immLut));
+    CUTLASS_PRAGMA_UNROLL
+    for (int ii = 1; ii < result_type::kElements / 2; ++ii) {
+      i4s >>= sizeof_bits<typename source_type::Element>::value;
+      // (i4s & 0x000f000f) | 0x43004300
+      asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n"
+                   : "=r"(h[ii])
+                   : "r"(i4s), "n"(MASK), "n"(I4s_TO_BF16s_MAGIC_NUM), "n"(immLut));
+    }
+
+    // This is the BF16 {-136, -136} represented as an integer.
+    static constexpr uint32_t BF16_BIAS = 0xC308C308;
+    static constexpr uint32_t BF16_ONE = 0x3F803F80;
+
+    // Finally, we construct the output numbers.
+    CUTLASS_PRAGMA_UNROLL
+    for (int ii = 0; ii < result_type::kElements / 2; ++ii) {
+      // Since this section is for Ampere+, we use bf16 fma to do the bias subtraction
+      asm("fma.rn.bf16x2 %0, %1, %2, %3;\n" : "=r"(h[ii]) : "r"(h[ii]), "r"(BF16_ONE), "r"(BF16_BIAS));
+    }
+#else
+    // Disable this on architectures older than Ampere since they lack hardware for bf16 mma. If one wishes to use
+    // HMMA on older hardware, they should convert directly to FP16 using FP16 converters.
+    arch::device_breakpoint();
+    result.clear();  // Suppress compiler warning.
+#endif
+    return result;
+  }
+
+  CUTLASS_DEVICE
+  result_type operator()(source_type const& s) { return convert(s); }
+};
+
+template <int N>
+struct FastInterleavedAndBiasedNumericArrayConverter<bfloat16_t, uint4b_t, N> {
+  static constexpr int VEC_WIDTH = 8;
+  static_assert(!(N % VEC_WIDTH), "N must be multiple of 8.");
+
+  using result_type = Array<bfloat16_t, N>;
+  using source_type = Array<uint4b_t, N>;
+
+  CUTLASS_DEVICE
+  static result_type convert(source_type const& source) {
+    using scalar_result_type = typename result_type::Element;
+    using scalar_source_type = typename source_type::Element;
+    FastInterleavedAndBiasedNumericArrayConverter<scalar_result_type, scalar_source_type, VEC_WIDTH> convert_vector_;
+
+    result_type result;
+    using vec_result = Array<scalar_result_type, VEC_WIDTH>;
+    using vec_source = Array<scalar_source_type, VEC_WIDTH>;
+
+    vec_result* result_ptr = reinterpret_cast<vec_result*>(&result);
+    vec_source const* source_ptr = reinterpret_cast<vec_source const*>(&source);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < N / VEC_WIDTH; ++i) {
+      result_ptr[i] = convert_vector_(source_ptr[i]);
+    }
+
+    return result;
+  }
+
+  CUTLASS_DEVICE
+  result_type operator()(source_type const& s) { return convert(s); }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+}  // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/onnxruntime/contrib_ops/cuda/moe/ft_moe/tile_interleaved_layout.h b/onnxruntime/contrib_ops/cuda/moe/cutlass_extensions/tile_interleaved_layout.h
similarity index 97%
rename from onnxruntime/contrib_ops/cuda/moe/ft_moe/tile_interleaved_layout.h
rename to onnxruntime/contrib_ops/cuda/moe/cutlass_extensions/tile_interleaved_layout.h
index 111d5240e40a..e5abefa35bc8 100644
--- a/onnxruntime/contrib_ops/cuda/moe/ft_moe/tile_interleaved_layout.h
+++ b/onnxruntime/contrib_ops/cuda/moe/cutlass_extensions/tile_interleaved_layout.h
@@ -31,9 +31,6 @@
 /*!
\file \brief Defines new layouts needed for MoE */ - -#ifdef USE_CUTLASS - #pragma once #include "cutlass/cutlass.h" @@ -45,7 +42,7 @@ namespace cutlass { namespace layout { template -class ColumnMajorTileInterleave { +struct ColumnMajorTileInterleave { static constexpr int kRowsPerTile = RowsPerTile; static constexpr int kColumnsInterleaved = ColumnsInterleaved; }; @@ -62,5 +59,3 @@ struct IsColumnMajorTileInterleave> { } // namespace layout } // namespace cutlass - -#endif diff --git a/onnxruntime/contrib_ops/cuda/moe/cutlass_extensions/transform/threadblock/fine_grained_scale_zero_iterator.h b/onnxruntime/contrib_ops/cuda/moe/cutlass_extensions/transform/threadblock/fine_grained_scale_zero_iterator.h new file mode 100644 index 000000000000..79811ef3e611 --- /dev/null +++ b/onnxruntime/contrib_ops/cuda/moe/cutlass_extensions/transform/threadblock/fine_grained_scale_zero_iterator.h @@ -0,0 +1,222 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Templates for visiting scales to be used when dequantizing the weights for weight-only GEMM + quantization. 
+*/ + +#pragma once + +#include "cutlass/array.h" +#include "cutlass/coord.h" +#include "cutlass/cutlass.h" +#include "cutlass/layout/matrix.h" +#include "cutlass/layout/pitch_linear.h" +#include "cutlass/matrix_shape.h" +#include "cutlass/predicate_vector.h" +#include "cutlass/tensor_ref.h" +#include "cutlass/tensor_view.h" +#include "cutlass/transform/threadblock/predicated_tile_access_iterator_params.h" + +//////////////////////////////////////////////////////////////////////////////// + +//////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace transform { +namespace threadblock { + +//////////////////////////////////////////////////////////////////////////////// + +template +class FineGrainedScaleZeroIterator; + +template +class FineGrainedScaleZeroIterator { + public: + using Shape = Shape_; + using Element = Element_; + using Layout = layout::RowMajor; + static int const kAdvanceRank = 0; + static int const kAlignment = Alignment_; + + static int const kAccessesPerVector = 1; + + /// Row index of scales corresponding to the groupsize of 64 + int row_groupsize64_; + int group_size_; + + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + + using TensorRef = TensorRef; + using TensorView = TensorView; + using TensorCoord = typename Layout::TensorCoord; + using Pointer = Element*; + using NonConstPointer = typename platform::remove_const::type*; + + using AccessType = AlignedArray; + + // For compatibility with existing iterator interface + struct Params { + LongIndex stride_ = 0; + + /// amount (in byte) to increment pointer from first access of current tile + /// to first access of next tile + LongIndex inc_advance_ = 0; + + // Default ctor + CUTLASS_HOST_DEVICE + Params() {} + + /// Construct the Params object given a pitch-linear tensor's layout + CUTLASS_HOST_DEVICE + explicit Params(Layout const& layout) : stride_(layout.stride(0)) { + inc_advance_ = Shape::kRow * stride_ * sizeof_bits::value / 8; + } + }; + + private: + /// Internal pointer type permits fast address arithmetic + using BytePointer = char*; + + private: + // + // Data members + // + + /// Parameters object with precomputed internal state + Params const params_; + + /// Internal pointer to first access of tile + BytePointer pointer_scale_; + BytePointer pointer_zero_; + + bool is_valid_ = false; + + public: + /// Constructs a TileIterator from its precomputed state, threadblock offset, + /// and thread ID + CUTLASS_DEVICE + FineGrainedScaleZeroIterator( + ///< Precomputed parameters object + Params const& params, + ///< Pointer to start of scale tensor + Pointer pointer_scale, + ///< Pointer to start of zero tensor + Pointer pointer_zero, + ///< Extent of the scale and bias + TensorCoord extent, + ///< ID of each participating thread + int thread_id, + ///< Initial offset of threadblock + TensorCoord const& threadblock_offset, + ///< Group size + int group_size) + : params_(params), + pointer_scale_(reinterpret_cast(const_cast(pointer_scale))), + pointer_zero_(reinterpret_cast(const_cast(pointer_zero))) { + row_groupsize64_ = threadblock_offset.row(); + group_size_ = group_size; + + const LongIndex tb_row_byte_offset = + threadblock_offset.row() / (group_size / 64) * params_.stride_ * sizeof_bits::value / 8; + const LongIndex tb_col_byte_offset = threadblock_offset.column() * sizeof_bits::value / 8; + pointer_scale_ += (tb_row_byte_offset + tb_col_byte_offset); + + if (pointer_zero_ != nullptr) { + pointer_zero_ += 
(tb_row_byte_offset + tb_col_byte_offset); + } + + static constexpr int THREADS_PER_ROW = Shape::kColumn / kAlignment; + + int const thread_row = thread_id / THREADS_PER_ROW; + int const thread_col = thread_id % THREADS_PER_ROW; + + const LongIndex thread_row_byte_offset = thread_row * params_.stride_ * sizeof_bits::value / 8; + const LongIndex thread_col_byte_offset = thread_col * kAlignment * sizeof_bits::value / 8; + pointer_scale_ += (thread_row_byte_offset + thread_col_byte_offset); + if (pointer_zero_ != nullptr) { + pointer_zero_ += (thread_row_byte_offset + thread_col_byte_offset); + } + + // For the rows, we must check that we are within the extent AND the tile to avoid extra reads on + // a given iteration. The same threads will be responsible for issues reads since the number of scales + // read in a given iteration is a constant. Therefore, we should never have to update is_valid_ + // outside of the constructor. + int const global_row = threadblock_offset.row() + thread_row; + int const global_col = threadblock_offset.column() + thread_col * kAlignment; + + bool const row_in_bounds = global_row < extent.row() && thread_row < Shape::kRow; + bool const col_in_bounds = global_col < extent.column(); + + is_valid_ = row_in_bounds && col_in_bounds; + } + + /// Construct a PredicatedTileAccessIterator with zero threadblock offset + CUTLASS_HOST_DEVICE FineGrainedScaleZeroIterator(Params const& params, ///< Precomputed parameters object + Pointer pointer_scale, ///< Pointer to start of scale tensor + Pointer pointer_zero, ///< Pointer to start of zero tensor + TensorCoord extent, ///< Extent of tensor + int thread_id, ///< ID of each participating thread + int group_size) + : FineGrainedScaleZeroIterator(params, pointer_scale, pointer_zero, extent, thread_id, make_Coord(0, 0), + group_size) {} + + CUTLASS_DEVICE + void add_tile_offset(TensorCoord const& tile_offset) { + const LongIndex row_byte_offset = tile_offset.row() * params_.inc_advance_; + const LongIndex col_byte_offset = tile_offset.column() * Shape::kColumn * sizeof_bits::value / 8; + pointer_scale_ += row_byte_offset + col_byte_offset; + if (pointer_zero_ != nullptr) { + pointer_zero_ += row_byte_offset + col_byte_offset; + } + } + + /// Clears the predicate set efficiently + CUTLASS_HOST_DEVICE void clear_mask(bool enable = true) { is_valid_ &= (!enable); } + + /// Returns whether access is valid or not + CUTLASS_HOST_DEVICE + bool valid() const { return is_valid_; } + + /// Returns a scale pointer + CUTLASS_HOST_DEVICE + AccessType* get_scale() const { return reinterpret_cast(pointer_scale_); } + + /// Returns a zero pointer + CUTLASS_HOST_DEVICE + AccessType* get_zero() const { return reinterpret_cast(pointer_zero_); } +}; + +} // namespace threadblock +} // namespace transform +} // namespace cutlass diff --git a/onnxruntime/contrib_ops/cuda/moe/cutlass_extensions/weight_only_quant_op.h b/onnxruntime/contrib_ops/cuda/moe/cutlass_extensions/weight_only_quant_op.h new file mode 100644 index 000000000000..403221a95601 --- /dev/null +++ b/onnxruntime/contrib_ops/cuda/moe/cutlass_extensions/weight_only_quant_op.h @@ -0,0 +1,50 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. 
Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Defines iterators used by warp-level matrix multiply operations targeting Tensor Cores. +*/ + +#pragma once + +namespace cutlass { + +enum class WeightOnlyQuantOp { UNDEFINED, + PER_COLUMN_SCALE_ONLY, + FINEGRAINED_SCALE_ONLY, + FINEGRAINED_SCALE_AND_ZEROS }; + +constexpr bool isFinegrained(WeightOnlyQuantOp op) { + return op == WeightOnlyQuantOp::FINEGRAINED_SCALE_AND_ZEROS || op == WeightOnlyQuantOp::FINEGRAINED_SCALE_ONLY; +} + +constexpr bool hasZero(WeightOnlyQuantOp op) { return op == WeightOnlyQuantOp::FINEGRAINED_SCALE_AND_ZEROS; } + +} // namespace cutlass diff --git a/onnxruntime/contrib_ops/cuda/moe/ft_moe/cutlass_heuristic.cc b/onnxruntime/contrib_ops/cuda/moe/ft_moe/cutlass_heuristic.cc index f0abd46572a9..cd59e904ad9e 100644 --- a/onnxruntime/contrib_ops/cuda/moe/ft_moe/cutlass_heuristic.cc +++ b/onnxruntime/contrib_ops/cuda/moe/ft_moe/cutlass_heuristic.cc @@ -13,7 +13,6 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#ifdef USE_CUTLASS #include "cutlass_heuristic.h" @@ -66,9 +65,9 @@ bool is_valid_split_k_factor(const int64_t m, const int64_t n, const int64_t k, } // Check that the workspace has sufficient space for this split-k factor - const int ctas_in_m_dim = static_cast((m + tile_shape.m - 1) / tile_shape.m); - const int ctas_in_n_dim = static_cast((n + tile_shape.n - 1) / tile_shape.n); - const int required_ws_bytes = split_k_factor == 1 ? 0 : sizeof(int) * ctas_in_m_dim * ctas_in_n_dim; + const size_t ctas_in_m_dim = static_cast((m + tile_shape.m - 1) / tile_shape.m); + const size_t ctas_in_n_dim = static_cast((n + tile_shape.n - 1) / tile_shape.n); + const size_t required_ws_bytes = split_k_factor == 1 ? 0 : sizeof(int) * ctas_in_m_dim * ctas_in_n_dim; if (required_ws_bytes > workspace_bytes) { return false; @@ -128,7 +127,7 @@ CutlassGemmConfig estimate_best_config_from_occupancies(const std::vector= multi_processor_count * 256 ? 
1 : split_k_limit; - for (int ii = 0; ii < candidate_configs.size(); ++ii) { + for (size_t ii = 0; ii < candidate_configs.size(); ++ii) { CutlassGemmConfig candidate_config = candidate_configs[ii]; TileShape tile_shape = get_cta_shape_for_config(candidate_config.tile_config); int occupancy = occupancies[ii]; @@ -152,8 +151,8 @@ CutlassGemmConfig estimate_best_config_from_occupancies(const std::vector(ctas_per_wave); + const float current_score = static_cast(num_waves_total) - num_waves_fractional; const float score_slack = 0.1f; if (current_score < config_score || @@ -186,5 +185,3 @@ CutlassGemmConfig estimate_best_config_from_occupancies(const std::vector #include @@ -38,4 +37,3 @@ CutlassGemmConfig estimate_best_config_from_occupancies(const std::vector -using void_t = void; - -template -struct use_dq_gemm : platform::false_type {}; - -template -struct use_dq_gemm> : platform::true_type {}; - -// SFINAE overload for dequantizing gemm -template ::value, bool>::type = true> -CUTLASS_DEVICE static void run_mma(Mma mma, int gemm_k_iterations, typename Mma::FragmentC& accum, - typename Mma::IteratorA iterator_A, typename Mma::IteratorB iterator_B, - typename Mma::FragmentC const& src_accum, ElementScale* weight_scale_ptr, - MatrixCoord scale_extent, const int thread_idx, MatrixCoord tb_offset_scale) { - typename Mma::IteratorScale iterator_scale(Mma::IteratorScale::Layout(scale_extent.column()), weight_scale_ptr, - scale_extent, thread_idx, tb_offset_scale); - - mma(gemm_k_iterations, accum, iterator_A, iterator_B, iterator_scale, src_accum); -} - -// SFINAE overload for normal gemm. This completely ignores the scale parameters -template ::value, bool>::type = true> -CUTLASS_DEVICE static void run_mma(Mma mma, int gemm_k_iterations, typename Mma::FragmentC& accum, - typename Mma::IteratorA iterator_A, typename Mma::IteratorB iterator_B, - typename Mma::FragmentC const& src_accum, ElementScale* weight_scale_ptr, - MatrixCoord scale_extent, const int thread_idx, MatrixCoord tb_offset_scale) { - mma(gemm_k_iterations, accum, iterator_A, iterator_B, src_accum); -} - -///////////////////////////////////////////////////////////////////////////////////////////////// - -template -struct MoeFCGemm { - public: - using Mma = Mma_; - using Epilogue = Epilogue_; - using EpilogueOutputOp = typename Epilogue::OutputOp; - using ThreadblockSwizzle = ThreadblockSwizzle_; - static GroupScheduleMode const kGroupScheduleMode = GroupScheduleMode_; - static bool const kTransposed = false; - - // Optional transpose - using MapArguments = - kernel::detail::MapArguments; - - // Public-facing type definitions related to operand element type, layout, and complex conjugate - // operation. Must interact with the 'kTransposed' notion. - static_assert(!kTransposed, "Transpose problem not supported"); - using ElementA = typename MapArguments::ElementA; - using LayoutA = typename MapArguments::LayoutA; - using ElementB = typename MapArguments::ElementB; - using LayoutB = typename MapArguments::LayoutB; - using ElementC = typename Epilogue::OutputTileIterator::Element; - using LayoutC = typename MapArguments::LayoutC; - using ElementScale = ElementC; - - static ComplexTransform const kTransformA = MapArguments::kTransformA; - static ComplexTransform const kTransformB = MapArguments::kTransformB; - - // Type definitions about the mainloop. 
- using Operator = typename Mma::Operator; - using OperatorClass = typename Mma::Operator::OperatorClass; - using ThreadblockShape = typename Mma::Shape; - using WarpShape = typename Mma::Operator::Shape; - using InstructionShape = typename Mma::Policy::Operator::InstructionShape; - using ArchTag = typename Mma::ArchTag; - - static int const kStages = Mma::kStages; - static int const kAlignmentA = MapArguments::kAlignmentA; - static int const kAlignmentB = MapArguments::kAlignmentB; - static int const kAlignmentC = Epilogue::OutputTileIterator::kElementsPerAccess; - - /// Warp count (concept: GemmShape) - using WarpCount = typename Mma::WarpCount; - static int const kThreadCount = 32 * WarpCount::kCount; - - using ProblemVisitor = - GemmMoeProblemVisitor; - - // - // Structures - // - - /// Argument structure - struct Arguments { - // - // Data members - // - - int problem_count; - int threadblock_count; - - typename EpilogueOutputOp::Params output_op; - - ElementA* ptr_A; - ElementB* ptr_B; - ElementScale* weight_scales; - ElementC* ptr_C; - ElementC* ptr_D; - - int64_t* total_rows_before_expert; - int64_t gemm_n; - int64_t gemm_k; - - // Only used by device-level operator - GemmCoord* host_problem_sizes; - - // - // Methods - // - - /// Default ctor - CUTLASS_HOST_DEVICE - Arguments() - : problem_count(0), - threadblock_count(0), - ptr_A(nullptr), - ptr_B(nullptr), - weight_scales(nullptr), - ptr_C(nullptr), - ptr_D(nullptr), - total_rows_before_expert(nullptr), - gemm_n(0), - gemm_k(0), - host_problem_sizes(nullptr) {} - - /// Ctor - CUTLASS_HOST_DEVICE - Arguments(int problem_count, int threadblock_count, typename EpilogueOutputOp::Params output_op, - const ElementA* ptr_A, const ElementB* ptr_B, const ElementScale* weight_scales, const ElementC* ptr_C, - ElementC* ptr_D, int64_t* total_rows_before_expert, int64_t gemm_n, int64_t gemm_k, - GemmCoord* host_problem_sizes = nullptr) - : problem_count(problem_count), - threadblock_count(threadblock_count), - output_op(output_op), - ptr_A(const_cast(ptr_A)), - ptr_B(const_cast(ptr_B)), - weight_scales(const_cast(weight_scales)), - ptr_C(const_cast(ptr_C)), - ptr_D(ptr_D), - total_rows_before_expert(total_rows_before_expert), - gemm_n(gemm_n), - gemm_k(gemm_k), - host_problem_sizes(nullptr) { - if (platform::is_same::value || platform::is_same::value) { - assert(weight_scales); - } - } - }; - - // - // Structure for precomputing values in host memory and passing to kernels - // - - /// Parameters structure - struct Params { - typename ProblemVisitor::Params problem_visitor; - int threadblock_count; - - typename EpilogueOutputOp::Params output_op; - - ElementA* ptr_A; - ElementB* ptr_B; - ElementScale* weight_scales; - ElementC* ptr_C; - ElementC* ptr_D; - - // - // Methods - // - - CUTLASS_HOST_DEVICE - Params() : ptr_A(nullptr), ptr_B(nullptr), weight_scales(nullptr), ptr_C(nullptr), ptr_D(nullptr) {} - - CUTLASS_HOST_DEVICE - Params(Arguments const& args, void* workspace = nullptr, int tile_count = 0) - : problem_visitor(args.total_rows_before_expert, args.gemm_n, args.gemm_k, args.problem_count, workspace, - tile_count), - threadblock_count(args.threadblock_count), - output_op(args.output_op), - ptr_A(args.ptr_A), - ptr_B(args.ptr_B), - weight_scales(args.weight_scales), - ptr_C(args.ptr_C), - ptr_D(args.ptr_D) {} - - CUTLASS_HOST_DEVICE - void update(Arguments const& args, void* workspace = nullptr, int tile_count = 0) { - problem_visitor = typename ProblemVisitor::Params(args.total_rows_before_expert, args.gemm_n, args.gemm_k, - 
args.problem_count, workspace, tile_count); - threadblock_count = args.threadblock_count; - output_op = args.output_op; - ptr_A = args.ptr_A; - ptr_B = args.ptr_B; - weight_scales = args.weight_scales; - ptr_C = args.ptr_C; - ptr_D = args.ptr_D; - } - }; - - /// Shared memory storage structure - union SharedStorage { - typename ProblemVisitor::SharedStorage problem_visitor; - typename Mma::SharedStorage main_loop; - typename Epilogue::SharedStorage epilogue; - }; - - public: - // - // Methods - // - - CUTLASS_DEVICE - MoeFCGemm() {} - - /// Determines whether kernel satisfies alignment - static Status can_implement(cutlass::gemm::GemmCoord const& problem_size) { return Status::kSuccess; } - - static Status can_implement(Arguments const& args) { - if (args.weight_scales != nullptr) { - CUTLASS_TRACE_HOST( - "MoeFCGemm::can_implement() - weight scales are ignored for all types except uint8_t and uint4b_t"); - return Status::kInvalid; - } - return Status::kSuccess; - } - - static size_t get_extra_workspace_size(Arguments const& args, cutlass::gemm::GemmCoord const& grid_tiled_shape) { - return 0; - } - - // The dummy template parameter is not used and exists so that we can compile this code using - // a standard earlier than C++17. Prior to C++17, fully specialized templates HAD to exists in - // a namespace - template - struct KernelRunner { - CUTLASS_DEVICE - static void run_kernel(Params const& params, SharedStorage& shared_storage) { CUTLASS_NOT_IMPLEMENTED(); } - }; - - template - struct KernelRunner { - CUTLASS_DEVICE - static void run_kernel(Params const& params, SharedStorage& shared_storage) { - // - // These types shadow the type-level definitions and support the ability to implement - // a 'transposed' GEMM that computes the transposed problems. - // - using ElementA = typename Mma::IteratorA::Element; - using LayoutA = typename Mma::IteratorA::Layout; - using ElementB = typename Mma::IteratorB::Element; - using LayoutB = typename Mma::IteratorB::Layout; - using ElementC = typename Epilogue::OutputTileIterator::Element; - using LayoutC = typename Epilogue::OutputTileIterator::Layout; - static constexpr int kInterleave = Mma::IteratorB::Shape::kRow / Mma::Shape::kK; - static_assert(platform::is_same::value && kInterleave == 1 || - platform::is_same::value && kInterleave >= 1, - "B must be row major/col major OR col major interleaved."); - - // - // Problem visitor. - // - ProblemVisitor problem_visitor(params.problem_visitor, shared_storage.problem_visitor, blockIdx.x); - - const int64_t gemm_k = params.problem_visitor.gemm_k; - const int64_t gemm_n = params.problem_visitor.gemm_n; - int64_t bytes_per_expert_matrix = (gemm_k * gemm_n / 8) * cutlass::sizeof_bits::value; - - // Outer 'persistent' loop to iterate over tiles - while (problem_visitor.next_tile()) { - GemmCoord problem_size = problem_visitor.problem_size(); - int32_t problem_idx = problem_visitor.problem_index(); - int32_t cta_idx = int32_t(problem_visitor.threadblock_idx()); - - GemmCoord grid_shape = problem_visitor.grid_shape(problem_size); - - cutlass::gemm::GemmCoord threadblock_offset(int(cta_idx / grid_shape.n()) * Mma::Shape::kM, - int(cta_idx % grid_shape.n()) * Mma::Shape::kN, 0); - - // Load element pointers. Exchange pointers and strides if working on the transpose - const int64_t rows_to_jump = - problem_idx == 0 ? 
0 : params.problem_visitor.last_row_for_problem[problem_idx - 1]; - ElementA* ptr_A = reinterpret_cast(params.ptr_A) + rows_to_jump * gemm_k; - typename LayoutA::LongIndex ldm_A = gemm_k; - - char* byte_ptr_B = ((char*)params.ptr_B) + problem_idx * bytes_per_expert_matrix; - ElementB* ptr_B = reinterpret_cast(byte_ptr_B); - typename LayoutB::LongIndex ldm_B = - platform::is_same::value ? gemm_n : gemm_k * kInterleave; - - // Compute initial location in logical coordinates - cutlass::MatrixCoord tb_offset_A{ - threadblock_offset.m(), - 0, - }; - - cutlass::MatrixCoord tb_offset_B{0, threadblock_offset.n() / kInterleave}; - - cutlass::MatrixCoord tb_offset_scale{0, threadblock_offset.n()}; - - // Compute position within threadblock - int thread_idx = threadIdx.x; - - // Construct iterators to A and B operands - typename Mma::IteratorA iterator_A(LayoutA(ldm_A), ptr_A, {problem_size.m(), problem_size.k()}, thread_idx, - tb_offset_A); - - typename Mma::IteratorB iterator_B(LayoutB(ldm_B), ptr_B, - {problem_size.k() * kInterleave, problem_size.n() / kInterleave}, thread_idx, - tb_offset_B); - - typename Mma::FragmentC accumulators; - - accumulators.clear(); - - // Broadcast the warp_id computed by lane 0 to ensure dependent code - // is compiled as warp-uniform. - int warp_idx = __shfl_sync(0xffffffff, threadIdx.x / 32, 0); - - int lane_idx = threadIdx.x % 32; - - // - // Matrix multiply phase - // - - // Construct thread-scoped matrix multiply - Mma mma(shared_storage.main_loop, thread_idx, warp_idx, lane_idx); - - // Compute threadblock-scoped matrix multiply-add - int gemm_k_iterations = (problem_size.k() + Mma::Shape::kK - 1) / Mma::Shape::kK; - - // Wait for all threads to finish their epilogue phases from the previous tile. - __syncthreads(); - - // Compute threadblock-scoped matrix multiply-add - ElementScale* weight_scale_ptr = params.weight_scales + problem_idx * problem_size.n(); - run_mma(mma, gemm_k_iterations, accumulators, iterator_A, iterator_B, accumulators, weight_scale_ptr, - {1, problem_size.n()}, thread_idx, tb_offset_scale); - - // - // Epilogue - // - - EpilogueOutputOp output_op(params.output_op); - - ElementC* ptr_C = reinterpret_cast(params.ptr_C) + problem_idx * gemm_n; - ElementC* ptr_D = reinterpret_cast(params.ptr_D) + rows_to_jump * gemm_n; - - LayoutC layout_C(0); - LayoutC layout_D(gemm_n); - - typename Epilogue::OutputTileIterator::Params params_C(layout_C); - typename Epilogue::OutputTileIterator::Params params_D(layout_D); - - // Tile iterator loading from source tensor. - typename Epilogue::OutputTileIterator iterator_C(params_C, ptr_C, problem_size.mn(), thread_idx, - threadblock_offset.mn()); - - // Tile iterator writing to destination tensor. - typename Epilogue::OutputTileIterator iterator_D(params_D, ptr_D, problem_size.mn(), thread_idx, - threadblock_offset.mn()); - - Epilogue epilogue(shared_storage.epilogue, thread_idx, warp_idx, lane_idx); - - // Execute the epilogue operator to update the destination tensor. - epilogue(output_op, iterator_D, accumulators, iterator_C); - - // Next tile - problem_visitor.advance(gridDim.x); - } - } - }; - - /* - To improve compilation speed, we do not compile the device operator if the CUDA_ARCH does not correspond - to the ArchTag of the cutlass kernel operator. 
- */ - /// Executes one GEMM - CUTLASS_DEVICE - void operator()(Params const& params, SharedStorage& shared_storage) { -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 700) && (__CUDA_ARCH__ < 750) - static constexpr bool compile_needed = platform::is_same::value; - KernelRunner::run_kernel(params, shared_storage); -#elif defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 750) && (__CUDA_ARCH__ < 800) - static constexpr bool compile_needed = platform::is_same::value; - KernelRunner::run_kernel(params, shared_storage); -#elif defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800) && (__CUDA_ARCH__ < 900) - static constexpr bool compile_needed = platform::is_same::value; - KernelRunner::run_kernel(params, shared_storage); -#else - CUTLASS_NOT_IMPLEMENTED(); -#endif - } -}; - -///////////////////////////////////////////////////////////////////////////////////////////////// - -} // namespace kernel -} // namespace gemm -} // namespace cutlass - -///////////////////////////////////////////////////////////////////////////////////////////////// - -#endif diff --git a/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_gemm_kernels.h b/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_gemm_kernels.h index a30bd1c1e9df..7e29dde8f897 100644 --- a/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_gemm_kernels.h +++ b/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_gemm_kernels.h @@ -14,12 +14,10 @@ * limitations under the License. */ -#ifdef USE_CUTLASS - #pragma once +#include "contrib_ops/cuda/moe/cutlass_extensions/gemm_configs.h" #include -#include "ft_gemm_configs.h" namespace ort_fastertransformer { @@ -44,8 +42,9 @@ class MoeGemmRunner { int64_t* total_rows_before_expert, int64_t total_rows, int64_t gemm_n, int64_t gemm_k, int num_experts, ActivationType activation_type, cudaStream_t stream); - void moe_gemm(const T* A, const WeightType* B, const T* weight_scales, T* C, int64_t* total_rows_before_expert, - int64_t total_rows, int64_t gemm_n, int64_t gemm_k, int num_experts, cudaStream_t stream); + void moe_gemm(const T* A, const WeightType* B, const T* weight_scales, const T* biases, T* C, + int64_t* total_rows_before_expert, int64_t total_rows, int64_t gemm_n, int64_t gemm_k, int num_experts, + cudaStream_t stream); private: template @@ -64,5 +63,3 @@ class MoeGemmRunner { }; } // namespace ort_fastertransformer - -#endif diff --git a/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_gemm_kernels_fp16_fp16.cu b/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_gemm_kernels_fp16_fp16.cu index 1d0dfe7c5a64..15cab9dd4a9b 100644 --- a/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_gemm_kernels_fp16_fp16.cu +++ b/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_gemm_kernels_fp16_fp16.cu @@ -13,13 +13,18 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ +#if defined(_MSC_VER) +#pragma warning(push) +#pragma warning(disable : 4100) +#pragma warning(disable : 4244) +#pragma warning(disable : 4200) +#endif -#ifdef USE_CUTLASS - -#include "moe_gemm_kernels_template.h" +#include "contrib_ops/cuda/moe/ft_moe/moe_gemm_kernels_template.h" +#if defined(_MSC_VER) +#pragma warning(pop) +#endif namespace ort_fastertransformer { template class MoeGemmRunner; } // namespace ort_fastertransformer - -#endif diff --git a/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_gemm_kernels_fp16_uint4.cu b/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_gemm_kernels_fp16_uint4.cu new file mode 100644 index 000000000000..1309a7c32a37 --- /dev/null +++ b/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_gemm_kernels_fp16_uint4.cu @@ -0,0 +1,30 @@ +/* + * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#if defined(_MSC_VER) +#pragma warning(push) +#pragma warning(disable : 4100) +#pragma warning(disable : 4244) +#pragma warning(disable : 4200) +#endif + +#include "contrib_ops/cuda/moe/ft_moe/moe_gemm_kernels_template.h" + +#if defined(_MSC_VER) +#pragma warning(pop) +#endif +namespace ort_fastertransformer { +template class MoeGemmRunner; +} // namespace ort_fastertransformer diff --git a/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_gemm_kernels_fp32_fp32.cu b/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_gemm_kernels_fp32_fp32.cu index 7a5d97902ee8..0277fab9df95 100644 --- a/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_gemm_kernels_fp32_fp32.cu +++ b/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_gemm_kernels_fp32_fp32.cu @@ -13,13 +13,19 @@ * See the License for the specific language governing permissions and * limitations under the License. */ +#if defined(_MSC_VER) +#pragma warning(push) +#pragma warning(disable : 4100) +#pragma warning(disable : 4244) +#pragma warning(disable : 4200) +#endif -#ifdef USE_CUTLASS +#include "contrib_ops/cuda/moe/ft_moe/moe_gemm_kernels_template.h" -#include "moe_gemm_kernels_template.h" +#if defined(_MSC_VER) +#pragma warning(pop) +#endif namespace ort_fastertransformer { template class MoeGemmRunner; } // namespace ort_fastertransformer - -#endif diff --git a/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_gemm_kernels_template.h b/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_gemm_kernels_template.h index 3fd0fc47055a..d81808e217fb 100644 --- a/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_gemm_kernels_template.h +++ b/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_gemm_kernels_template.h @@ -14,29 +14,38 @@ * limitations under the License. 
*/ -#ifdef USE_CUTLASS - // Ignore CUTLASS warnings about type punning #ifdef __GNUC__ #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wstrict-aliasing" #endif +// Ignore CUTLASS warning C4100: unreferenced formal parameter +#if defined(_MSC_VER) +#pragma warning(push) +#pragma warning(disable : 4100) +#endif + +#include "cutlass/arch/arch.h" #include "cutlass/array.h" -#include "cutlass/numeric_conversion.h" -#include "cutlass/layout/matrix.h" -#include "cutlass/numeric_types.h" -#include "cutlass/gemm/device/gemm_grouped.h" -#include "cutlass/gemm/kernel/default_gemm_grouped.h" #include "cutlass/cutlass.h" -#include "cutlass/gemm/gemm.h" -#include "cutlass/arch/arch.h" #include "cutlass/epilogue/thread/linear_combination_relu.h" +#include "cutlass/gemm/device/gemm_grouped.h" +#include "cutlass/gemm/gemm.h" +#include "cutlass/gemm/kernel/default_gemm_grouped.h" +#include "cutlass/layout/matrix.h" +#include "cutlass/numeric_conversion.h" +#include "cutlass/numeric_types.h" -#include "compute_occupancy.h" -#include "epilogue_helpers.h" -#include "layout_traits_helper.h" -#include "moe_cutlass_kernel.h" +#include "contrib_ops/cuda/moe/cutlass_extensions/compute_occupancy.h" +#include "contrib_ops/cuda/moe/cutlass_extensions/epilogue_helpers.h" +#include "contrib_ops/cuda/moe/cutlass_extensions/gemm/kernel/default_fpA_intB_traits.h" +#include "contrib_ops/cuda/moe/cutlass_extensions/gemm/kernel/moe_cutlass_kernel.h" +#include "contrib_ops/cuda/moe/cutlass_extensions/gemm/threadblock/default_mma.h" + +#if defined(_MSC_VER) +#pragma warning(pop) +#endif #ifdef __GNUC__ #pragma GCC diagnostic pop @@ -59,10 +68,6 @@ void generic_moe_gemm_kernelLauncher(const T* A, const WeightType* B, const T* w int64_t* total_rows_before_expert, int64_t gemm_n, int64_t gemm_k, int num_experts, CutlassGemmConfig gemm_config, const int multi_processor_count, cudaStream_t stream, int* kernel_occupancy = nullptr) { - if (gemm_config.split_k_style != SplitKStyle::NO_SPLIT_K) { - ORT_THROW("[FT Error][MoeGemm] Grouped gemm does not support split-k"); - } - static_assert(cutlass::platform::is_same::value || cutlass::platform::is_same::value, "Specialized for half, float"); @@ -79,10 +84,11 @@ void generic_moe_gemm_kernelLauncher(const T* A, const WeightType* B, const T* w using CutlassWeightType_ = typename cutlass::platform::conditional::value, cutlass::half_t, WeightType>::type; + using CutlassWeightType = CutlassWeightType_; - // We need separate config for each architecture since we will target different tensorcore instructions. For float, - // we do not target TCs. + // We need separate config for each architecture since we will target different tensorcore instructions. For + // float, we do not target TCs. 
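  // (MixedGemmArchTraits maps the activation type, weight type, and target arch to the instruction
  // shape, operator class, and accumulator type used to build the grouped kernel below; float
  // activations take the SIMT path rather than tensor cores, as noted above.)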
using MixedGemmArchTraits = cutlass::gemm::kernel::MixedGemmArchTraits; using ElementAccumulator = typename MixedGemmArchTraits::AccType; @@ -111,17 +117,17 @@ void generic_moe_gemm_kernelLauncher(const T* A, const WeightType* B, const T* w return; } int occupancy = std::min(2, GemmGrouped::maximum_active_blocks()); - if (occupancy == 0) { - ORT_THROW("[FT Error][MoE Runner] GPU lacks the shared memory resources to run GroupedGEMM kernel"); - } - const int threadblock_count = multi_processor_count * occupancy; + ORT_ENFORCE(occupancy > 0, "GPU lacks the shared memory resources to run GroupedGEMM kernel"); + int const threadblock_count = multi_processor_count * occupancy; - typename EpilogueOp::Params epilogue_op(ElementAccumulator(1.f), ElementAccumulator(0.f)); + typename EpilogueOp::Params epilogue_op(ElementAccumulator(1.f), + biases ? ElementAccumulator(1.f) : ElementAccumulator(0.f)); + int const group_size = gemm_k; typename GemmGrouped::Arguments args( - num_experts, threadblock_count, epilogue_op, reinterpret_cast(A), - reinterpret_cast(B), reinterpret_cast(weight_scales), - reinterpret_cast(biases), reinterpret_cast(C), total_rows_before_expert, gemm_n, + num_experts, threadblock_count, group_size, epilogue_op, reinterpret_cast(A), + reinterpret_cast(B), reinterpret_cast(weight_scales), + reinterpret_cast(biases), reinterpret_cast(C), total_rows_before_expert, gemm_n, gemm_k); GemmGrouped gemm; @@ -151,10 +157,10 @@ void generic_moe_gemm_kernelLauncher(const T* A, const WeightType* B, const T* w template struct dispatch_stages { - static void dispatch(const T* A, const WeightType* B, const T* weight_scales, const T* biases, T* C, - int64_t* total_rows_before_expert, int64_t gemm_n, int64_t gemm_k, int num_experts, - CutlassGemmConfig gemm_config, int multi_processor_count, cudaStream_t stream, - int* occupancy = nullptr) { + static void dispatch(const T* /*A*/, const WeightType* /*B*/, const T* /*weight_scales*/, const T* /*biases*/, + T* /*C*/, int64_t* /*total_rows_before_expert*/, int64_t /*gemm_n*/, int64_t /*gemm_k*/, + int /*num_experts*/, CutlassGemmConfig /*gemm_config*/, int /*multi_processor_count*/, + cudaStream_t /*stream*/, [[maybe_unused]] int* occupancy = nullptr) { std::string err_msg = "Cutlass fpA_intB gemm. 
Not instantiated for arch " + std::to_string(arch::kMinComputeCapability) + " with stages set to " + std::to_string(Stages);
    ORT_THROW("[FT Error][dispatch_stages::dispatch] " + err_msg);
@@ -223,10 +229,28 @@
 template <
     typename T, typename WeightType, typename arch, typename EpilogueTag,
     typename std::enable_if<!std::is_same<T, float>::value && std::is_same<T, WeightType>::value>::type* = nullptr>
 void dispatch_moe_gemm_to_cutlass(const T* A, const WeightType* B, const T* weight_scales, const T* biases, T* C,
-                                  int64_t* total_rows_before_expert, int64_t total_rows, int64_t gemm_n, int64_t gemm_k,
-                                  int num_experts, CutlassGemmConfig gemm_config, int sm_version,
+                                  int64_t* total_rows_before_expert, int64_t /*total_rows*/, int64_t gemm_n,
+                                  int64_t gemm_k, int num_experts, CutlassGemmConfig gemm_config, int /*sm_version*/,
                                   int multi_processor_count, cudaStream_t stream, int* occupancy = nullptr) {
   switch (gemm_config.tile_config) {
+    case CutlassTileConfig::CtaShape16x128x64_WarpShape16x32x64:
+      ORT_ENFORCE(arch::kMinComputeCapability >= 75, "Invalid config on Volta");
+      if constexpr (arch::kMinComputeCapability >= 75) {
+        dispatch_gemm_config<T, WeightType, arch, EpilogueTag, cutlass::gemm::GemmShape<16, 128, 64>,
+                             cutlass::gemm::GemmShape<16, 32, 64>>(
+            A, B, weight_scales, biases, C, total_rows_before_expert, gemm_n, gemm_k, num_experts, gemm_config,
+            multi_processor_count, stream, occupancy);
+      }
+      break;
+    case CutlassTileConfig::CtaShape16x256x64_WarpShape16x64x64:
+      ORT_ENFORCE(arch::kMinComputeCapability >= 75, "Invalid config on Volta");
+      if constexpr (arch::kMinComputeCapability >= 75) {
+        dispatch_gemm_config<T, WeightType, arch, EpilogueTag, cutlass::gemm::GemmShape<16, 256, 64>,
+                             cutlass::gemm::GemmShape<16, 64, 64>>(
+            A, B, weight_scales, biases, C, total_rows_before_expert, gemm_n, gemm_k, num_experts, gemm_config,
+            multi_processor_count, stream, occupancy);
+      }
+      break;
     case CutlassTileConfig::CtaShape32x128x64_WarpShape32x32x64:
       dispatch_gemm_config<T, WeightType, arch, EpilogueTag, cutlass::gemm::GemmShape<32, 128, 64>,
                            cutlass::gemm::GemmShape<32, 32, 64>>(A, B, weight_scales, biases, C,
@@ -246,13 +270,13 @@ void dispatch_moe_gemm_to_cutlass(const T* A, const WeightType* B, const T* weig
                                                                  gemm_config, multi_processor_count, stream, occupancy);
       break;
     case CutlassTileConfig::Undefined:
-      ORT_THROW("[FT Error][dispatch_moe_gemm_to_cutlass] gemm config undefined.");
+      ORT_THROW("GEMM config undefined.");
       break;
     case CutlassTileConfig::ChooseWithHeuristic:
-      ORT_THROW("[FT Error][dispatch_moe_gemm_to_cutlass] gemm config should have already been set by heuristic.");
+      ORT_THROW("GEMM config should have already been set by heuristic.");
       break;
     default:
-      ORT_THROW("[FT Error][dispatch_moe_gemm_to_cutlass] Config is invalid for same type MoE tensorop GEMM.");
+      ORT_THROW("Config is invalid for same type tensorop GEMM.");
       break;
   }
 }
@@ -302,8 +326,8 @@ void dispatch_moe_gemm_to_cutlass(const T* A, const WeightType* B, const T* weig
 template <typename T, typename WeightType, typename arch, typename EpilogueTag,
           typename std::enable_if<std::is_same<T, float>::value>::type* = nullptr>
 void dispatch_moe_gemm_to_cutlass(const T* A, const WeightType* B, const T* weight_scales, const T* biases, T* C,
-                                  int64_t* total_rows_before_expert, int64_t total_rows, int64_t gemm_n, int64_t gemm_k,
-                                  int num_experts, CutlassGemmConfig gemm_config, int sm_version,
+                                  int64_t* total_rows_before_expert, int64_t /*total_rows*/, int64_t gemm_n,
+                                  int64_t gemm_k, int num_experts, CutlassGemmConfig gemm_config, int /*sm_version*/,
                                   int multi_processor_count, cudaStream_t stream, int* occupancy = nullptr) {
   switch (gemm_config.tile_config) {
     case CutlassTileConfig::CtaShape128x128x8_WarpShape64x64x8:
@@ -395,20 +419,20 @@ void MoeGemmRunner<T, WeightType>::moe_gemm_bias_act(const T* A, const WeightTyp
                                                      cudaStream_t stream) {
   switch (activation_type) {
     case ActivationType::Relu:
- run_gemm(A, B, weight_scales, biases, C, total_rows_before_expert, total_rows, gemm_n, gemm_k, - num_experts, stream); + run_gemm(A, B, weight_scales, biases, C, total_rows_before_expert, total_rows, gemm_n, + gemm_k, num_experts, stream); break; case ActivationType::Gelu: - run_gemm(A, B, weight_scales, biases, C, total_rows_before_expert, total_rows, gemm_n, - gemm_k, num_experts, stream); + run_gemm(A, B, weight_scales, biases, C, total_rows_before_expert, total_rows, gemm_n, + gemm_k, num_experts, stream); break; case ActivationType::Silu: - run_gemm(A, B, weight_scales, biases, C, total_rows_before_expert, total_rows, gemm_n, gemm_k, - num_experts, stream); + run_gemm(A, B, weight_scales, biases, C, total_rows_before_expert, total_rows, gemm_n, + gemm_k, num_experts, stream); break; case ActivationType::Identity: - run_gemm(A, B, weight_scales, biases, C, total_rows_before_expert, total_rows, gemm_n, gemm_k, - num_experts, stream); + run_gemm(A, B, weight_scales, biases, C, total_rows_before_expert, total_rows, gemm_n, gemm_k, + num_experts, stream); break; case ActivationType::InvalidType: ORT_THROW("[FT Error][MoE Runner] Invalid activation type for MoE GEMM"); @@ -420,13 +444,11 @@ void MoeGemmRunner::moe_gemm_bias_act(const T* A, const WeightTyp } template -void MoeGemmRunner::moe_gemm(const T* A, const WeightType* B, const T* weight_scales, T* C, - int64_t* total_rows_before_expert, int64_t total_rows, int64_t gemm_n, +void MoeGemmRunner::moe_gemm(const T* A, const WeightType* B, const T* weight_scales, const T* biases, + T* C, int64_t* total_rows_before_expert, int64_t total_rows, int64_t gemm_n, int64_t gemm_k, int num_experts, cudaStream_t stream) { - run_gemm(A, B, weight_scales, nullptr, C, total_rows_before_expert, total_rows, gemm_n, gemm_k, - num_experts, stream); + run_gemm(A, B, weight_scales, biases, C, total_rows_before_expert, total_rows, gemm_n, gemm_k, + num_experts, stream); } } // namespace ort_fastertransformer - -#endif diff --git a/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_kernel.cu b/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_kernel.cu index 9232e8d01293..360c0aacd9c7 100644 --- a/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_kernel.cu +++ b/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_kernel.cu @@ -16,13 +16,11 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. 
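The moe_kernel.cu hunks that follow thread a new normalize_routing_weights flag through the gating kernels: when it is set, the k routing weights selected for a row are rescaled by their running sum so that they sum to one before the expert outputs are combined. A minimal host-side sketch of the intended per-row semantics (a hypothetical standalone helper for illustration, not the kernel code itself):

#include <algorithm>
#include <vector>

// Reference for what the gating kernels compute per row when
// normalize_routing_weights is true: select the k largest gate
// probabilities, then rescale them by their sum.
std::vector<float> topk_normalized(std::vector<float> gates, int k) {
  std::vector<float> weights(k);
  float row_sum = 0.f;
  for (int i = 0; i < k; ++i) {
    auto it = std::max_element(gates.begin(), gates.end());
    weights[i] = *it;
    row_sum += *it;
    *it = -1.f;  // exclude from later iterations; inputs are probabilities
  }
  for (float& w : weights) {
    w /= row_sum;  // the normalization step the new flag enables
  }
  return weights;
}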
-#ifdef USE_CUTLASS
-
+#include
 #include
 #include
 #include
 #include
-#include
 
 // Ignore CUTLASS warnings about type punning
 #ifdef __GNUC__
@@ -32,7 +30,6 @@
 #include "cutlass/array.h"
 #include "cutlass/numeric_conversion.h"
-#include "cutlass/numeric_types.h"
 
 #ifdef __GNUC__
 #pragma GCC diagnostic pop
@@ -51,7 +48,6 @@
 #endif
 
 namespace ort_fastertransformer {
-
 static constexpr int WARP_SIZE = 32;
 
 // ====================== Softmax things ===============================
@@ -110,14 +106,15 @@
 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 530
 template <typename T, int TPB>
-__launch_bounds__(TPB) __global__ void moe_top_k(const T*, const bool*, T*, int*, int*, int, const int) {
+__launch_bounds__(TPB) __global__ void moe_top_k(const T*, const bool*, T*, int*, int*, int, int, bool) {
   // Does not support pre-Kepler architectures
   ;
 }
 #else
 template <typename T, int TPB>
-__launch_bounds__(TPB) __global__ void moe_top_k(const T* inputs_after_softmax, const bool* finished, T* output,
-                                                 int* indices, int* source_rows, int num_experts, int k) {
+__launch_bounds__(TPB) __global__
+    void moe_top_k(const T* inputs_after_softmax, const bool* finished, T* output, int* indices, int* source_rows,
+                   int num_experts, int k, bool normalize_routing_weights) {
   using cub_kvp = cub::KeyValuePair<int, T>;
   using BlockReduce = cub::BlockReduce<cub_kvp, TPB>;
   __shared__ typename BlockReduce::TempStorage tmpStorage;
@@ -130,6 +127,7 @@
   const bool should_process_row = finished ? !finished[block_row] : true;
   const int thread_read_offset = blockIdx.x * num_experts;
+  float output_row_sum = 0.f;
   for (int k_idx = 0; k_idx < k; ++k_idx) {
     thread_kvp.key = 0;
     thread_kvp.value = T(-1.f);  // This is OK because inputs are probabilities
@@ -157,6 +155,14 @@
       output[idx] = result_kvp.value;
+      output_row_sum = output_row_sum + static_cast<float>(result_kvp.value);
       indices[idx] = should_process_row ? result_kvp.key : num_experts;
       source_rows[idx] = k_idx * num_rows + block_row;
+
+      if (normalize_routing_weights && k_idx == k - 1) {
+#pragma unroll
+        for (int ki = 0; ki < k; ++ki) {
+          output[idx - ki] = T(static_cast<float>(output[idx - ki]) / output_row_sum);
+        }
+      }
     }
     __syncthreads();
   }
@@ -180,7 +185,7 @@
 template <typename T, int VPT, int NUM_EXPERTS, int WARPS_PER_CTA, int BYTES_PER_LDG>
 __launch_bounds__(WARPS_PER_CTA* WARP_SIZE) __global__
     void topk_gating_softmax(const T* input, const bool* finished, T* output, int num_rows, int* indices,
-                             int* source_rows, int k) {
+                             int* source_rows, int k, bool normalize_routing_weights) {
   // We begin by enforcing compile time assertions and setting up compile time constants.
   static_assert(VPT == (VPT & -VPT), "VPT must be power of 2");
   static_assert(NUM_EXPERTS == (NUM_EXPERTS & -NUM_EXPERTS), "NUM_EXPERTS must be power of 2");
@@ -298,6 +303,7 @@
   int start_col = first_elt_read_by_thread;
   static constexpr int COLS_PER_GROUP_LDG = ELTS_PER_LDG * THREADS_PER_ROW;
+  float output_row_sum = 0.f;
   for (int k_idx = 0; k_idx < k; ++k_idx) {
     // First, each thread does the local argmax
     float max_val = row_chunk[0];
@@ -338,8 +344,16 @@
       // single) thread per row of the input/output matrices.
       const int idx = k * thread_row + k_idx;
       output[idx] = T(max_val);
+      output_row_sum = output_row_sum + static_cast<float>(max_val);
       indices[idx] = should_process_row ?
expert : NUM_EXPERTS; source_rows[idx] = k_idx * num_rows + thread_row; + + if (normalize_routing_weights && k_idx == k - 1) { +#pragma unroll + for (int ki = 0; ki < k; ++ki) { + output[idx - ki] = T(static_cast(output[idx - ki]) / output_row_sum); + } + } } // Finally, we clear the value in the thread with the current max if there is another iteration to run. @@ -372,7 +386,8 @@ struct TopkConstants { template void topk_gating_softmax_launcher_helper(const T* input, const bool* finished, T* output, int* indices, int* source_row, - int num_rows, int num_experts, int k, cudaStream_t stream) { + int num_rows, int /*num_experts*/, int k, bool normalize_routing_weights, + cudaStream_t stream) { static constexpr unsigned long MAX_BYTES_PER_LDG = 16; static constexpr int BYTES_PER_LDG = std::min((int)MAX_BYTES_PER_LDG, (int)sizeof(T) * EXPERTS); @@ -383,62 +398,62 @@ void topk_gating_softmax_launcher_helper(const T* input, const bool* finished, T const int num_blocks = (num_warps + WARPS_PER_TB - 1) / WARPS_PER_TB; dim3 block_dim(WARP_SIZE, WARPS_PER_TB); - topk_gating_softmax - <<>>(input, finished, output, num_rows, indices, source_row, k); + topk_gating_softmax<<>>( + input, finished, output, num_rows, indices, source_row, k, normalize_routing_weights); } template void topk_gating_softmax_kernelLauncher(const T* input, const bool* finished, T* output, T* softmax_temp_output, - int* indices, int* source_row, int num_rows, int num_experts, - int k, cudaStream_t stream) { + int* indices, int* source_row, int num_rows, int num_experts, int k, + bool normalize_routing_weights, cudaStream_t stream) { static constexpr int WARPS_PER_TB = 4; switch (num_experts) { case 2: { topk_gating_softmax_launcher_helper(input, finished, output, indices, source_row, num_rows, - num_experts, k, stream); + num_experts, k, normalize_routing_weights, stream); break; } case 4: { topk_gating_softmax_launcher_helper(input, finished, output, indices, source_row, num_rows, - num_experts, k, stream); + num_experts, k, normalize_routing_weights, stream); break; } case 8: { topk_gating_softmax_launcher_helper(input, finished, output, indices, source_row, num_rows, - num_experts, k, stream); + num_experts, k, normalize_routing_weights, stream); break; } case 16: { topk_gating_softmax_launcher_helper(input, finished, output, indices, source_row, num_rows, - num_experts, k, stream); + num_experts, k, normalize_routing_weights, stream); break; } case 32: { topk_gating_softmax_launcher_helper(input, finished, output, indices, source_row, num_rows, - num_experts, k, stream); + num_experts, k, normalize_routing_weights, stream); break; } case 64: { topk_gating_softmax_launcher_helper(input, finished, output, indices, source_row, num_rows, - num_experts, k, stream); + num_experts, k, normalize_routing_weights, stream); break; } case 128: { topk_gating_softmax_launcher_helper(input, finished, output, indices, source_row, num_rows, - num_experts, k, stream); + num_experts, k, normalize_routing_weights, stream); break; } case 256: { topk_gating_softmax_launcher_helper(input, finished, output, indices, source_row, num_rows, - num_experts, k, stream); + num_experts, k, normalize_routing_weights, stream); break; } default: { static constexpr int TPB = 256; moe_softmax<<>>(input, finished, softmax_temp_output, num_experts); - moe_top_k - <<>>(softmax_temp_output, finished, output, indices, source_row, num_experts, k); + moe_top_k<<>>(softmax_temp_output, finished, output, indices, source_row, + num_experts, k, normalize_routing_weights); 
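+      // The fused topk_gating_softmax path above handles only power-of-two expert counts up to
+      // 256, where a full row of gate logits fits in registers; every other shape takes this
+      // fallback: a standalone softmax into softmax_temp_output followed by the generic
+      // moe_top_k reduction over that buffer.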
} } } @@ -505,8 +520,8 @@ __global__ void compute_total_rows_before_expert_kernel(const int* sorted_expert total_rows_before_expert[expert] = find_total_elts_leq_target(sorted_experts, sorted_experts_len, expert); } -__global__ void dispatch_activations_kernel(int64_t* total_rows_before_expert, int num_experts, - int local_num_experts, int local_experts_start_index) { +__global__ void dispatch_activations_kernel(int64_t* total_rows_before_expert, int num_experts, int local_num_experts, + int local_experts_start_index) { const int expert = blockIdx.x * blockDim.x + threadIdx.x; const int local_experts_end_index = local_experts_start_index + local_num_experts - 1; @@ -523,25 +538,30 @@ __global__ void dispatch_activations_kernel(int64_t* total_rows_before_expert, i } template -CutlassMoeFCRunner::CutlassMoeFCRunner(int sm_version) { - total_past_rows_ = 0; - total_covered_rows_ = 0; +CutlassMoeFCRunner::CutlassMoeFCRunner(int sm_version, bool has_fc3, + bool normalize_routing_weights) + : has_fc3_(has_fc3), + total_past_rows_(0), + total_covered_rows_(0), + normalize_routing_weights_(normalize_routing_weights) { moe_gemm_runner_.initialize(sm_version); } template -size_t CutlassMoeFCRunner::getWorkspaceSize(int num_rows, const int hidden_size, - const int inter_size, int num_experts, - int k) { - const int buf_size = static_cast(pad_to_multiple_of_16(k * num_rows * hidden_size)); - const int interbuf_size = static_cast(pad_to_multiple_of_16(k * num_rows * inter_size)); - const int padded_experts = static_cast(pad_to_multiple_of_16(num_experts)); - const int num_moe_inputs = static_cast(pad_to_multiple_of_16(k * num_rows)); - int num_softmax_outs = 0; +size_t CutlassMoeFCRunner::getWorkspaceSize(size_t num_rows, const size_t hidden_size, + const size_t inter_size, size_t num_experts, + size_t k) { + total_covered_rows_ = k * num_rows; + + const size_t buf_size = pad_to_multiple_of_16(k * num_rows * hidden_size); + const size_t interbuf_size = pad_to_multiple_of_16(k * num_rows * inter_size); + const size_t padded_experts = pad_to_multiple_of_16(num_experts); + const size_t num_moe_inputs = pad_to_multiple_of_16(k * num_rows); + size_t num_softmax_outs = 0; const bool is_pow_2 = (num_experts != 0) && ((num_experts & (num_experts - 1)) == 0); if (!is_pow_2 || num_experts > 256) { - num_softmax_outs = static_cast(pad_to_multiple_of_16(num_rows * num_experts)); + num_softmax_outs = pad_to_multiple_of_16(num_rows * num_experts); } // softmax output, permuted_rows and permuted_experts have moved to outside of moe kernel, allocate them @@ -550,13 +570,13 @@ size_t CutlassMoeFCRunner::getWorkspaceSize(int num_rows, total_ws_bytes += buf_size * sizeof(T); // permuted_data total_ws_bytes += padded_experts * sizeof(int64_t); // Hold total_rows_before_expert_ total_ws_bytes += num_softmax_outs * sizeof(T); - const int bytes_for_fc1_result = interbuf_size * sizeof(T); - const int sorter_ws_size_bytes = static_cast(pad_to_multiple_of_16(sorter_.getWorkspaceSize(num_rows))); - sorter_.update_num_experts(num_experts); + const size_t bytes_for_fc1_result = has_fc3_ ? 
+  const size_t sorter_ws_size_bytes = pad_to_multiple_of_16(sorter_.getWorkspaceSize(num_rows));
+  sorter_.update_num_experts(static_cast<int>(num_experts));
 
-  int bytes_for_intermediate_and_sorting = bytes_for_fc1_result;
+  size_t bytes_for_intermediate_and_sorting = bytes_for_fc1_result;
   if (sorter_ws_size_bytes > bytes_for_fc1_result) {
-    int remaining_bytes = static_cast<int>(pad_to_multiple_of_16(sorter_ws_size_bytes - bytes_for_fc1_result));
+    size_t remaining_bytes = pad_to_multiple_of_16(sorter_ws_size_bytes - bytes_for_fc1_result);
     bytes_for_intermediate_and_sorting += remaining_bytes;
   }
 
@@ -565,39 +585,140 @@
 }
 
 template <typename T, typename WeightType, typename Enable>
-void CutlassMoeFCRunner<T, WeightType, Enable>::configure_ws_ptrs(char* ws_ptr, int num_rows,
-                                                                  const int hidden_size, const int inter_size,
-                                                                  int num_experts, int k) {
-  const int buf_size = static_cast<int>(pad_to_multiple_of_16(k * num_rows * hidden_size));
-  const int interbuf_size = static_cast<int>(pad_to_multiple_of_16(k * num_rows * inter_size));
-  const int padded_experts = static_cast<int>(pad_to_multiple_of_16(num_experts));
-  const int num_moe_inputs = static_cast<int>(pad_to_multiple_of_16(k * num_rows));
-
-  source_rows_ = (int*)ws_ptr;
+void CutlassMoeFCRunner<T, WeightType, Enable>::configure_ws_ptrs(char* ws_ptr, size_t num_rows,
+                                                                  const size_t hidden_size, const size_t inter_size,
+                                                                  size_t num_experts, size_t k) {
+  const size_t buf_size = pad_to_multiple_of_16(k * num_rows * hidden_size);
+  const size_t interbuf_size = pad_to_multiple_of_16(k * num_rows * inter_size);
+  const size_t padded_experts = pad_to_multiple_of_16(num_experts);
+  const size_t num_moe_inputs = pad_to_multiple_of_16(k * num_rows);
+
+  source_rows_ = reinterpret_cast<int*>(ws_ptr);
   permuted_rows_ = source_rows_ + num_moe_inputs;
   permuted_experts_ = permuted_rows_ + num_moe_inputs;
-  permuted_data_ = (T*)(permuted_experts_ + num_moe_inputs);
+  permuted_data_ = reinterpret_cast<T*>(permuted_experts_ + num_moe_inputs);
 
-  total_rows_before_expert_ = (int64_t*)(permuted_data_ + buf_size);
+  total_rows_before_expert_ = reinterpret_cast<int64_t*>(permuted_data_ + buf_size);
 
-  fc1_result_ = (T*)(total_rows_before_expert_ + padded_experts);
+  if (has_fc3_) {
+    fc3_result_ = reinterpret_cast<T*>(total_rows_before_expert_ + padded_experts);
+    fc1_result_ = reinterpret_cast<T*>(fc3_result_ + interbuf_size);
+  } else {
+    fc1_result_ = reinterpret_cast<T*>(total_rows_before_expert_ + padded_experts);
+  }
 
   const bool is_pow_2 = (num_experts != 0) && ((num_experts & (num_experts - 1)) == 0);
   if (!is_pow_2 || num_experts > 256) {
-    softmax_out_ = (T*)(fc1_result_ + interbuf_size);
+    softmax_out_ = reinterpret_cast<T*>(fc1_result_ + interbuf_size);
   } else {
     softmax_out_ = nullptr;
   }
 }
 
+namespace {
+
+struct __align__(8) Half4 {
+  half2 x;
+  half2 y;
+};
+
+// TODO(wy): move to common header
+template <typename T>
+struct T4;
+template <>
+struct T4<float> {
+  using Type = float4;
+};
+template <>
+struct T4<half> {
+  using Type = Half4;
+};
+
+template <typename T>
+struct T2;
+template <>
+struct T2<float> {
+  using Type = float2;
+};
+template <>
+struct T2<half> {
+  using Type = half2;
+};
+
+inline __device__ float2 operator*(const float2 a, const float2 b) { return make_float2(a.x * b.x, a.y * b.y); }
+
+inline __device__ float4 operator*(const float4 a, const float4 b) {
+  return make_float4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w);
+}
+
+// TODO(wy): use cuda common header and investigate pipeline build issue.
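+// Devices below sm_53 have no native fp16 arithmetic instructions, so the guarded
+// operator overloads below emulate half/half2 multiplication by round-tripping
+// through float (e.g. __float2half(__half2float(a) * __half2float(b))). The
+// __CUDACC_VER_MAJOR__/__CUDACC_VER_MINOR__ check skips the definitions on newer
+// toolkits (CUDA 12.2+), which supply equivalent overloads themselves.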
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 530 && \
+    ((__CUDACC_VER_MAJOR__ < 12) || ((__CUDACC_VER_MAJOR__ == 12) && (__CUDACC_VER_MINOR__ < 2)))
+inline __device__ half operator*(const half a, const half b) {
+  return __float2half(__half2float(a) * __half2float(b));
+}
+
+inline __device__ half2 operator*(const half2 a, const half2 b) {
+  return make_half2(a.x * b.x, a.y * b.y);
+}
+#endif
+
+// TODO(wy): use cuda common header and investigate pipeline build issue.
+inline __device__ Half4 operator*(const Half4 a, const Half4 b) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 530 && \
+    ((__CUDACC_VER_MAJOR__ < 12) || ((__CUDACC_VER_MAJOR__ == 12) && (__CUDACC_VER_MINOR__ < 2)))
+  Half4 result;
+  result.x = a.x * b.x;
+  result.y = a.y * b.y;
+  return result;
+#else
+  return Half4{__hmul2(a.x, b.x), __hmul2(a.y, b.y)};
+#endif
+}
+
+}  // anonymous namespace
+
+template <typename T>
+__global__ void elementWiseMulKernel(T* output, T const* input, size_t inter_size) {
+  int const tid = threadIdx.x;
+  int const token = blockIdx.x;
+
+  output = output + token * inter_size;
+  input = input + token * inter_size;
+  for (int i = tid; i < inter_size; i += blockDim.x) {
+    T fc1_value = input[i];
+    output[i] = fc1_value * output[i];
+  }
+}
+
+template <typename T>
+void elementWiseMul(T* output, T const* input, int inter_size, int num_tokens, cudaStream_t stream) {
+  int const blocks = num_tokens;
+
+  // Note: the mask must be parenthesized before comparing; "inter_size & 3 == 0"
+  // would parse as "inter_size & (3 == 0)" and silently disable the vectorized paths.
+  if ((inter_size & 3) == 0) {
+    using vec_type = typename T4<T>::Type;
+    int const threads = std::min(inter_size / 4, 1024);
+    elementWiseMulKernel<<<blocks, threads, 0, stream>>>(
+        reinterpret_cast<vec_type*>(output), reinterpret_cast<vec_type const*>(input), inter_size / 4);
+  } else if ((inter_size & 1) == 0) {
+    using vec_type = typename T2<T>::Type;
+    int const threads = std::min(inter_size / 2, 1024);
+    elementWiseMulKernel<<<blocks, threads, 0, stream>>>(
+        reinterpret_cast<vec_type*>(output), reinterpret_cast<vec_type const*>(input), inter_size / 2);
+  } else {
+    int const threads = std::min(inter_size, 1024);
+    elementWiseMulKernel<<<blocks, threads, 0, stream>>>(output, input, inter_size);
+  }
+}
+
 template <typename T, typename WeightType, typename Enable>
 void CutlassMoeFCRunner<T, WeightType, Enable>::run_moe_fc(
     const T* input_activations, const T* gating_output, const WeightType* fc1_expert_weights, const T* fc1_scales,
-    const T* fc1_expert_biases, ActivationType fc1_activation_type, const WeightType* fc2_expert_weights,
-    const T* fc2_scales, int num_rows, const int hidden_size, const int inter_size, int num_experts,
-    int local_num_experts, int local_experts_start_index, int k, char* workspace_ptr, T* fc2_result,
-    const bool* finished, int active_rows, T* expert_scales, int* expanded_source_row_to_expanded_dest_row,
-    int* expert_for_source_row, cudaStream_t stream) {
+    const T* fc1_expert_biases, ActivationType fc1_activation_type, const WeightType* fc3_expert_weights,
+    const T* fc3_scales, const T* fc3_expert_biases, const WeightType* fc2_expert_weights, const T* fc2_scales,
+    int num_rows, const int hidden_size, const int inter_size, int num_experts, int local_num_experts,
+    int local_experts_start_index, int k, char* workspace_ptr, T* fc2_result, const bool* finished, int active_rows,
+    T* expert_scales, int* expanded_source_row_to_expanded_dest_row, int* expert_for_source_row, cudaStream_t stream) {
   static constexpr bool scales_required =
       std::is_same<WeightType, uint8_t>::value || std::is_same<WeightType, cutlass::uint4b_t>::value;
@@ -615,13 +736,14 @@ void CutlassMoeFCRunner<T, WeightType, Enable>::run_moe_fc(
     }
   }
 
-  configure_ws_ptrs(workspace_ptr, num_rows, hidden_size, inter_size, num_experts, k);
+  configure_ws_ptrs(workspace_ptr, static_cast<size_t>(num_rows), static_cast<size_t>(hidden_size),
+                    static_cast<size_t>(inter_size), static_cast<size_t>(num_experts), static_cast<size_t>(k));
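+  // Workspace layout as configured above (each sub-buffer padded to 16 bytes):
+  //   source_rows_ | permuted_rows_ | permuted_experts_ | permuted_data_ |
+  //   total_rows_before_expert_ | fc3_result_ (only when has_fc3_) | fc1_result_ |
+  //   softmax_out_ (only when num_experts is not a power of two or exceeds 256).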
topk_gating_softmax_kernelLauncher(gating_output, finished, expert_scales, softmax_out_, expert_for_source_row, - source_rows_, num_rows, num_experts, k, stream); + source_rows_, num_rows, num_experts, k, normalize_routing_weights_, stream); const int sorter_ws_size_bytes = static_cast(pad_to_multiple_of_16(sorter_.getWorkspaceSize(k * num_rows))); - sorter_.run((void*)fc1_result_, sorter_ws_size_bytes, expert_for_source_row, permuted_experts_, source_rows_, - permuted_rows_, k * num_rows, stream); + sorter_.run(reinterpret_cast(fc1_result_), sorter_ws_size_bytes, expert_for_source_row, permuted_experts_, + source_rows_, permuted_rows_, k * num_rows, stream); initialize_moe_routing_kernelLauncher(input_activations, permuted_data_, permuted_rows_, expanded_source_row_to_expanded_dest_row, num_rows, active_rows, hidden_size, k, @@ -635,33 +757,63 @@ void CutlassMoeFCRunner::run_moe_fc( dispatch_activations(total_rows_before_expert_, num_experts, local_num_experts, local_experts_start_index, stream); } - // expanded_active_expert_rows is not used - moe_gemm_runner_.moe_gemm_bias_act(permuted_data_ + total_past_rows_ * hidden_size, - fc1_expert_weights, fc1_scales, fc1_expert_biases, - fc1_result_ + total_past_rows_ * inter_size, - total_rows_before_expert_ + local_experts_start_index, - expanded_active_expert_rows, inter_size, hidden_size, - local_num_experts, fc1_activation_type, stream); + moe_gemm_runner_.moe_gemm_bias_act(permuted_data_ + total_past_rows_ * hidden_size, fc1_expert_weights, fc1_scales, + fc1_expert_biases, fc1_result_ + total_past_rows_ * inter_size, + total_rows_before_expert_ + local_experts_start_index, expanded_active_expert_rows, + inter_size, hidden_size, local_num_experts, fc1_activation_type, stream); + + if (has_fc3_) { + if (scales_required) { + if (fc3_scales == nullptr) { + ORT_THROW("[FT Error][Run MoE FC] Scales expected but scale for third matmul is a null pointer"); + } + } else { + if (fc3_scales != nullptr) { + ORT_THROW("[FT Error][Run MoE FC] Scales are ignored for fp32/fp16/bf16 but received scale for FC3"); + } + } + if (fc3_expert_weights == nullptr) { + ORT_THROW("[FT Error][Run MoE FC] FC3 weights are null"); + } + moe_gemm_runner_.moe_gemm(permuted_data_ + total_past_rows_ * hidden_size, fc3_expert_weights, fc3_scales, + fc3_expert_biases, fc3_result_ + total_past_rows_ * inter_size, + total_rows_before_expert_ + local_experts_start_index, expanded_active_expert_rows, + inter_size, hidden_size, local_num_experts, stream); - moe_gemm_runner_.moe_gemm(fc1_result_ + total_past_rows_ * inter_size, - fc2_expert_weights, fc2_scales, + elementWiseMul(fc1_result_ + total_past_rows_ * inter_size, fc3_result_ + total_past_rows_ * inter_size, + static_cast(inter_size), static_cast(total_covered_rows_), stream); + } + + moe_gemm_runner_.moe_gemm(fc1_result_ + total_past_rows_ * inter_size, fc2_expert_weights, fc2_scales, nullptr, fc2_result + total_past_rows_ * hidden_size, - total_rows_before_expert_ + local_experts_start_index, - expanded_active_expert_rows, hidden_size, inter_size, local_num_experts, stream); + total_rows_before_expert_ + local_experts_start_index, expanded_active_expert_rows, + hidden_size, inter_size, local_num_experts, stream); } +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 700 +template +void CutlassMoeFCRunner::run_moe_fc(const T*, const T*, const WeightType*, const T*, const T*, + ActivationType, const WeightType*, const T*, const T*, + const WeightType*, const T*, int, const int, const int, int, + int, int, int k, char*, T*, T*, 
int*, int*, cudaStream_t) { + // MoE gemm only supports Volta+ architectures + ORT_THROW("[FT Error][Run MoE FC] MoE gemm only supports Volta+ architectures"); +} +#else template void CutlassMoeFCRunner::run_moe_fc( const T* input_activations, const T* gating_output, const WeightType* fc1_expert_weights, const T* fc1_scales, - const T* fc1_expert_biases, ActivationType fc1_activation_type, const WeightType* fc2_expert_weights, - const T* fc2_scales, int num_rows, const int hidden_size, const int inter_size, int num_experts, - int local_num_experts, int local_experts_start_index, int k, char* workspace_ptr, T* fc2_result, T* expert_scales, + const T* fc1_expert_biases, ActivationType fc1_activation_type, const WeightType* fc3_expert_weights, + const T* fc3_scales, const T* fc3_expert_biases, const WeightType* fc2_expert_weights, const T* fc2_scales, + int num_rows, const int hidden_size, const int inter_size, int num_experts, int local_num_experts, + int local_experts_start_index, int k, char* workspace_ptr, T* fc2_result, T* expert_scales, int* expanded_source_row_to_expanded_dest_row, int* expert_for_source_row, cudaStream_t stream) { run_moe_fc(input_activations, gating_output, fc1_expert_weights, fc1_scales, fc1_expert_biases, fc1_activation_type, - fc2_expert_weights, fc2_scales, num_rows, hidden_size, inter_size, num_experts, local_num_experts, - local_experts_start_index, k, workspace_ptr, fc2_result, nullptr, num_rows, expert_scales, - expanded_source_row_to_expanded_dest_row, expert_for_source_row, stream); + fc3_expert_weights, fc3_scales, fc3_expert_biases, fc2_expert_weights, fc2_scales, num_rows, hidden_size, + inter_size, num_experts, local_num_experts, local_experts_start_index, k, workspace_ptr, fc2_result, + nullptr, num_rows, expert_scales, expanded_source_row_to_expanded_dest_row, expert_for_source_row, stream); } +#endif template void CutlassMoeFCRunner::compute_total_rows_before_expert(const int* sorted_indices, @@ -677,8 +829,8 @@ void CutlassMoeFCRunner::compute_total_rows_before_expert } template -void CutlassMoeFCRunner::dispatch_activations(int64_t* total_rows_before_expert, - int num_experts, int local_num_experts, +void CutlassMoeFCRunner::dispatch_activations(int64_t* total_rows_before_expert, int num_experts, + int local_num_experts, int local_experts_start_index, cudaStream_t stream) { total_rows_before_expert_host_.resize(num_experts); @@ -692,16 +844,15 @@ void CutlassMoeFCRunner::dispatch_activations(int64_t* to cudaEventCreateWithFlags(©_event, cudaEventDisableTiming); cudaEventRecord(copy_event, stream); - dispatch_activations_kernel<<>>(total_rows_before_expert, num_experts, - local_num_experts, local_experts_start_index); + dispatch_activations_kernel<<>>(total_rows_before_expert, num_experts, local_num_experts, + local_experts_start_index); get_total_rows_info(local_experts_start_index, local_num_experts, total_past_rows_, total_covered_rows_); } template void CutlassMoeFCRunner::get_total_rows_info(int64_t experts_start_index, - int64_t local_num_experts, - int64_t& total_past_rows, + int64_t local_num_experts, int64_t& total_past_rows, int64_t& total_covered_rows) { int64_t experts_end_index = experts_start_index + local_num_experts - 1; total_past_rows = 0; @@ -758,8 +909,8 @@ __global__ void initialize_moe_routing_kernel(const T* unpermuted_input, T* perm template void initialize_moe_routing_kernelLauncher(const T* unpermuted_input, T* permuted_output, const int* expanded_dest_row_to_expanded_source_row, - int* 
expanded_source_row_to_expanded_dest_row, int num_rows, - int active_rows, int cols, int k, cudaStream_t stream) { + int* expanded_source_row_to_expanded_dest_row, int num_rows, int active_rows, + int cols, int k, cudaStream_t stream) { const int blocks = num_rows * k; const int threads = std::min(cols, 1024); initialize_moe_routing_kernel @@ -813,9 +964,10 @@ __global__ void finalize_moe_routing_kernel(const T* expanded_permuted_rows, T* const T* expanded_permuted_rows_row_ptr = expanded_permuted_rows + expanded_permuted_row * cols; const int expert_idx = expert_for_source_row[k_offset]; - const T* bias_ptr = bias + expert_idx * cols; + const T* bias_ptr = bias ? bias + expert_idx * cols : nullptr; - thread_output = thread_output + row_scale * (expanded_permuted_rows_row_ptr[tid] + bias_ptr[tid]); + thread_output = + thread_output + row_scale * (expanded_permuted_rows_row_ptr[tid] + (bias_ptr ? bias_ptr[tid] : T(0))); } reduced_row_ptr[tid] = thread_output; } @@ -825,8 +977,8 @@ __global__ void finalize_moe_routing_kernel(const T* expanded_permuted_rows, T* template void finalize_moe_routing_kernelLauncher(const T* expanded_permuted_rows, T* reduced_unpermuted_output, const T* bias, const T* scales, const int* expanded_source_row_to_expanded_dest_row, - const int* expert_for_source_row, int num_rows, int cols, - int k, cudaStream_t stream) { + const int* expert_for_source_row, int num_rows, int cols, int k, + cudaStream_t stream) { const int blocks = num_rows; const int threads = std::min(cols, 1024); finalize_moe_routing_kernel<<>>( @@ -838,8 +990,8 @@ template void finalize_moe_routing_kernelLauncher(const T* expanded_permuted_rows, T* reduced_unpermuted_output, const T* skip, const T* bias, const T* scales, const int* expanded_source_row_to_expanded_dest_row, - const int* expert_for_source_row, int num_rows, int cols, - int k, cudaStream_t stream) { + const int* expert_for_source_row, int num_rows, int cols, int k, + cudaStream_t stream) { const int blocks = num_rows; const int threads = std::min(cols, 1024); finalize_moe_routing_kernel @@ -851,8 +1003,8 @@ template void finalize_moe_routing_kernelLauncher(const T* expanded_permuted_rows, T* reduced_unpermuted_output, const T* skip_1, const T* skip_2, const T* bias, const T* scales, const int* expanded_source_row_to_expanded_dest_row, - const int* expert_for_source_row, int num_rows, int cols, - int k, cudaStream_t stream) { + const int* expert_for_source_row, int num_rows, int cols, int k, + cudaStream_t stream) { const int blocks = num_rows; const int threads = std::min(cols, 1024); if (skip_2 == nullptr) { @@ -867,20 +1019,21 @@ void finalize_moe_routing_kernelLauncher(const T* expanded_permuted_rows, T* red } // ========================= TopK Softmax specializations =========================== -template void topk_gating_softmax_kernelLauncher(const float*, const bool*, float*, float*, int*, int*, int, - int, int, cudaStream_t); -template void topk_gating_softmax_kernelLauncher(const half*, const bool*, half*, half*, int*, int*, int, - int, int, cudaStream_t); +template void topk_gating_softmax_kernelLauncher(const float*, const bool*, float*, float*, int*, int*, int, int, int, + bool, cudaStream_t); +template void topk_gating_softmax_kernelLauncher(const half*, const bool*, half*, half*, int*, int*, int, int, int, + bool, cudaStream_t); // ==================== Variable batched GEMM specializations ================================== template class CutlassMoeFCRunner; template class CutlassMoeFCRunner; +template class 
CutlassMoeFCRunner; // ===================== Specializations for init routing ========================= -template void initialize_moe_routing_kernelLauncher(const float*, float*, const int*, int*, int, int, - int, int, cudaStream_t); -template void initialize_moe_routing_kernelLauncher(const half*, half*, const int*, int*, int, int, - int, int, cudaStream_t); +template void initialize_moe_routing_kernelLauncher(const float*, float*, const int*, int*, int, int, int, int, + cudaStream_t); +template void initialize_moe_routing_kernelLauncher(const half*, half*, const int*, int*, int, int, int, int, + cudaStream_t); // ==================== Specializations for final routing =================================== template void finalize_moe_routing_kernelLauncher(const float*, float*, const float*, const float*, const int*, @@ -888,17 +1041,12 @@ template void finalize_moe_routing_kernelLauncher(const float*, float*, const fl template void finalize_moe_routing_kernelLauncher(const half*, half*, const half*, const half*, const int*, const int*, int, int, int, cudaStream_t); template void finalize_moe_routing_kernelLauncher(const float*, float*, const float*, const float*, const float*, - const int*, const int*, int, int, int, - cudaStream_t); + const int*, const int*, int, int, int, cudaStream_t); template void finalize_moe_routing_kernelLauncher(const half*, half*, const half*, const half*, const half*, const int*, const int*, int, int, int, cudaStream_t); template void finalize_moe_routing_kernelLauncher(const float*, float*, const float*, const float*, const float*, - const float*, const int*, const int*, int, int, int, - cudaStream_t); + const float*, const int*, const int*, int, int, int, cudaStream_t); template void finalize_moe_routing_kernelLauncher(const half*, half*, const half*, const half*, const half*, - const half*, const int*, const int*, int, int, int, - cudaStream_t); + const half*, const int*, const int*, int, int, int, cudaStream_t); } // namespace ort_fastertransformer - -#endif diff --git a/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_kernel.h b/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_kernel.h index f09471de1cc2..5eef6f95f482 100644 --- a/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_kernel.h +++ b/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_kernel.h @@ -16,8 +16,6 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. 
-#ifdef USE_CUTLASS - #pragma once #include "moe_gemm_kernels.h" @@ -26,6 +24,8 @@ #include "core/common/common.h" #include "contrib_ops/cuda/bert/transformer_cuda_common.h" +#include "cutlass/numeric_types.h" + using namespace onnxruntime; namespace ort_fastertransformer { @@ -109,12 +109,13 @@ template class CutlassMoeFCRunner { public: - CutlassMoeFCRunner(int sm_version); + CutlassMoeFCRunner(int sm_version, bool has_fc3, bool normalize_routing_weights); - size_t getWorkspaceSize(int num_rows, int hidden_size, int inter_size, int num_experts, int k); + size_t getWorkspaceSize(size_t num_rows, size_t hidden_size, size_t inter_size, size_t num_experts, size_t k); void run_moe_fc(const T* input_activations, const T* gating_output, const WeightType* fc1_expert_weights, const T* fc1_scales, const T* fc1_expert_biases, ActivationType fc1_activation_type, + const WeightType* fc3_expert_weights, const T* fc3_scales, const T* fc3_expert_biases, const WeightType* fc2_expert_weights, const T* fc2_scales, int num_rows, int hidden_size, int inter_size, int num_experts, int local_num_experts, int local_experts_start_index, int k, char* workspace_ptr, T* fc2_result, T* expert_scales, int* expanded_source_row_to_expanded_dest_row, @@ -122,6 +123,7 @@ class CutlassMoeFCRunner { void run_moe_fc(const T* input_activations, const T* gating_output, const WeightType* fc1_expert_weights, const T* fc1_scales, const T* fc1_expert_biases, ActivationType fc1_activation_type, + const WeightType* fc3_expert_weights, const T* fc3_scales, const T* fc3_expert_biases, const WeightType* fc2_expert_weights, const T* fc2_scales, int num_rows, int hidden_size, int inter_size, int num_experts, int local_num_experts, int local_experts_start_index, int k, char* workspace_ptr, T* fc2_result, const bool* finished, int active_rows, T* expert_scales, @@ -137,7 +139,8 @@ class CutlassMoeFCRunner { int64_t& total_covered_rows); private: - void configure_ws_ptrs(char* ws_ptr, int num_rows, int hidden_size, int inter_size, int num_experts, int k); + void configure_ws_ptrs(char* ws_ptr, size_t num_rows, size_t hidden_size, size_t inter_size, size_t num_experts, + size_t k); private: CubKeyValueSorter sorter_; @@ -154,12 +157,17 @@ class CutlassMoeFCRunner { int64_t* total_rows_before_expert_; T* fc1_result_; + T* fc3_result_; + + bool has_fc3_; + bool normalize_routing_weights_; // Cuda events contrib::cuda::AutoDestoryCudaEvent cuda_event_; int64_t total_past_rows_; int64_t total_covered_rows_; + // TODO: use pinned memory std::vector total_rows_before_expert_host_; }; @@ -167,13 +175,11 @@ class CutlassMoeFCRunner { template class CutlassMoeFCRunner::value>> { public: - CutlassMoeFCRunner(int sm_version); + CutlassMoeFCRunner(int sm_version, bool has_fc3, bool normalize_routing_weights); - size_t getWorkspaceSize(int num_rows, int hidden_size, int inter_size, int num_experts, int k) { + size_t getWorkspaceSize(size_t num_rows, size_t hidden_size, size_t inter_size, size_t num_experts, size_t k) { return 0; } }; } // namespace ort_fastertransformer - -#endif diff --git a/onnxruntime/contrib_ops/cuda/moe/moe.cc b/onnxruntime/contrib_ops/cuda/moe/moe.cc index 0da06192e266..dbd783c0cb11 100644 --- a/onnxruntime/contrib_ops/cuda/moe/moe.cc +++ b/onnxruntime/contrib_ops/cuda/moe/moe.cc @@ -1,8 +1,6 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. 
-#ifdef USE_CUTLASS - #include "core/common/safeint.h" #include "core/providers/cuda/cuda_common.h" #include "moe.h" @@ -15,39 +13,33 @@ namespace onnxruntime { namespace contrib { namespace cuda { -#define REGISTER_KERNEL_TYPED(T) \ - ONNX_OPERATOR_TYPED_KERNEL_EX( \ - MoE, \ - kMSDomain, \ - 1, \ - T, \ - kCudaExecutionProvider, \ - (*KernelDefBuilder::Create()) \ - .MayInplace(0, 0) \ - .TypeConstraint("T", DataTypeImpl::GetTensorType()), \ - MoE); +#define REGISTER_KERNEL_TYPED(T) \ + ONNX_OPERATOR_TYPED_KERNEL_EX( \ + MoE, kMSDomain, 1, T, kCudaExecutionProvider, \ + (*KernelDefBuilder::Create()).MayInplace(0, 0).TypeConstraint("T", DataTypeImpl::GetTensorType()), MoE); REGISTER_KERNEL_TYPED(float) REGISTER_KERNEL_TYPED(MLFloat16) -using namespace ONNX_NAMESPACE; - template -MoE::MoE(const OpKernelInfo& op_kernel_info) : CudaKernel(op_kernel_info), MoEBase(op_kernel_info) { -} +MoE::MoE(const OpKernelInfo& op_kernel_info) : CudaKernel(op_kernel_info), MoEBase(op_kernel_info) {} template Status MoE::ComputeInternal(OpKernelContext* context) const { const Tensor* input = context->Input(0); const Tensor* router_probs = context->Input(1); const Tensor* fc1_experts_weights = context->Input(2); - const Tensor* fc2_experts_weights = context->Input(3); - const Tensor* fc1_experts_bias_optional = context->Input(4); + const Tensor* fc1_experts_bias_optional = context->Input(3); + const Tensor* fc2_experts_weights = context->Input(4); const Tensor* fc2_experts_bias_optional = context->Input(5); + const Tensor* fc3_experts_weights_optional = context->Input(6); + const Tensor* fc3_experts_bias_optional = context->Input(7); MoEParameters moe_params; - ORT_RETURN_IF_ERROR(CheckInputs(moe_params, input, router_probs, fc1_experts_weights, fc2_experts_weights, - fc1_experts_bias_optional, fc2_experts_bias_optional)); + MoEQuantType quant_type = MoEQuantType::None; + ORT_RETURN_IF_ERROR(CheckInputs(moe_params, quant_type, input, router_probs, fc1_experts_weights, + fc1_experts_bias_optional, fc2_experts_weights, fc2_experts_bias_optional, + fc3_experts_weights_optional, fc3_experts_bias_optional)); typedef typename ToCudaType::MappedType CudaT; auto stream = context->GetComputeStream(); @@ -55,12 +47,12 @@ Status MoE::ComputeInternal(OpKernelContext* context) const { auto& device_prop = GetDeviceProp(); const int sm = device_prop.major * 10 + device_prop.minor; - ort_fastertransformer::CutlassMoeFCRunner moe_runner(sm); + ort_fastertransformer::CutlassMoeFCRunner moe_runner(sm, fc3_experts_weights_optional != nullptr, + normalize_routing_weights_); - size_t ws_size = - moe_runner.getWorkspaceSize(static_cast(moe_params.num_rows), static_cast(moe_params.hidden_size), - static_cast(moe_params.inter_size), static_cast(moe_params.num_experts), - static_cast(k_)); + size_t ws_size = moe_runner.getWorkspaceSize( + static_cast(moe_params.num_rows), static_cast(moe_params.hidden_size), + static_cast(moe_params.inter_size), static_cast(moe_params.num_experts), static_cast(k_)); size_t fc2_output_size = k_ * moe_params.num_rows * moe_params.hidden_size * sizeof(CudaT); size_t expert_scales_size = k_ * moe_params.num_rows * sizeof(CudaT); size_t expanded_source_row_to_expanded_dest_row_size = k_ * moe_params.num_rows * sizeof(int); @@ -79,26 +71,29 @@ Status MoE::ComputeInternal(OpKernelContext* context) const { IAllocatorUniquePtr expert_for_source_row = IAllocator::MakeUniquePtr(allocator, expert_for_source_row_size, false, stream); - // fc1_scales and fc2_scales are used in quantized MoE - const CudaT* 
fc1_scales_ptr = nullptr; - const CudaT* fc2_scales_ptr = nullptr; - - moe_runner.run_moe_fc(reinterpret_cast(input->template Data()), - reinterpret_cast(router_probs->template Data()), - reinterpret_cast(fc1_experts_weights->template Data()), - std::move(fc1_scales_ptr), - fc1_experts_bias_optional == nullptr - ? nullptr - : reinterpret_cast(fc1_experts_bias_optional->template Data()), - activation_type_, reinterpret_cast(fc2_experts_weights->template Data()), - std::move(fc2_scales_ptr), static_cast(moe_params.num_rows), - static_cast(moe_params.hidden_size), static_cast(moe_params.inter_size), - static_cast(moe_params.num_experts), static_cast(moe_params.local_num_experts), - 0 /*local_experts_start_index_ used in sharded MoE*/, static_cast(k_), - reinterpret_cast(work_space.get()), reinterpret_cast(fc2_output.get()), - reinterpret_cast(expert_scales.get()), - reinterpret_cast(expanded_source_row_to_expanded_dest_row.get()), - reinterpret_cast(expert_for_source_row.get()), Stream(context)); + const CudaT* fc_scales_ptr = nullptr; + moe_runner.run_moe_fc( + reinterpret_cast(input->template Data()), + reinterpret_cast(router_probs->template Data()), + reinterpret_cast(fc1_experts_weights->DataRaw()), fc_scales_ptr, + fc1_experts_bias_optional == nullptr + ? nullptr + : reinterpret_cast(fc1_experts_bias_optional->template Data()), + activation_type_, + fc3_experts_weights_optional == nullptr ? nullptr + : reinterpret_cast(fc3_experts_weights_optional->DataRaw()), + fc_scales_ptr, + fc3_experts_bias_optional == nullptr + ? nullptr + : reinterpret_cast(fc3_experts_bias_optional->template Data()), + reinterpret_cast(fc2_experts_weights->DataRaw()), fc_scales_ptr, + static_cast(moe_params.num_rows), static_cast(moe_params.hidden_size), + static_cast(moe_params.inter_size), static_cast(moe_params.num_experts), + static_cast(moe_params.local_num_experts), 0 /*local_experts_start_index_ used in sharded MoE*/, + static_cast(k_), reinterpret_cast(work_space.get()), reinterpret_cast(fc2_output.get()), + reinterpret_cast(expert_scales.get()), + reinterpret_cast(expanded_source_row_to_expanded_dest_row.get()), + reinterpret_cast(expert_for_source_row.get()), Stream(context)); Tensor* output = context->Output(0, input->Shape()); @@ -110,8 +105,7 @@ Status MoE::ComputeInternal(OpKernelContext* context) const { reinterpret_cast(expert_scales.get()), reinterpret_cast(expanded_source_row_to_expanded_dest_row.get()), reinterpret_cast(expert_for_source_row.get()), static_cast(moe_params.num_rows), - static_cast(moe_params.hidden_size), - static_cast(k_), Stream(context)); + static_cast(moe_params.hidden_size), static_cast(k_), Stream(context)); return Status::OK(); } @@ -119,5 +113,3 @@ Status MoE::ComputeInternal(OpKernelContext* context) const { } // namespace cuda } // namespace contrib } // namespace onnxruntime - -#endif diff --git a/onnxruntime/contrib_ops/cuda/moe/moe.h b/onnxruntime/contrib_ops/cuda/moe/moe.h index 710b914f0633..c4d8c4dc64c5 100644 --- a/onnxruntime/contrib_ops/cuda/moe/moe.h +++ b/onnxruntime/contrib_ops/cuda/moe/moe.h @@ -1,8 +1,6 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. 
-#ifdef USE_CUTLASS - #pragma once #include "contrib_ops/cuda/moe/ft_moe/moe_kernel.h" @@ -26,5 +24,3 @@ class MoE final : public CudaKernel, public MoEBase { } // namespace cuda } // namespace contrib } // namespace onnxruntime - -#endif diff --git a/onnxruntime/contrib_ops/cuda/moe/moe_base.h b/onnxruntime/contrib_ops/cuda/moe/moe_base.h index dc8b9d57f79f..4a407fa1b215 100644 --- a/onnxruntime/contrib_ops/cuda/moe/moe_base.h +++ b/onnxruntime/contrib_ops/cuda/moe/moe_base.h @@ -1,11 +1,10 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -#ifdef USE_CUTLASS - #pragma once #include "core/common/common.h" +#include "core/framework/tensor_shape.h" #include "core/framework/op_kernel.h" #include "contrib_ops/cuda/moe/ft_moe/moe_gemm_kernels.h" @@ -15,27 +14,36 @@ namespace cuda { enum class MoEParallelType { None = 0, - ExpertSlicing = 1, + EP = 1, + TP = 2, + EPAndTP = 3, +}; + +enum class MoEQuantType { + None = 0, + UINT4 = 1, }; struct MoEParameters { + MoEParameters() {} + explicit MoEParameters(int64_t tensor_shards) : tensor_shards(tensor_shards) {} int64_t num_rows; int64_t num_experts; int64_t local_num_experts; int64_t hidden_size; int64_t inter_size; + MoEParallelType parallel_type; + int64_t tensor_shards{1}; }; class MoEBase { public: - Status CheckInputs(MoEParameters& parameters, - const Tensor* input, - const Tensor* router_probs, - const Tensor* fc1_experts_weights, - const Tensor* fc2_experts_weights, - const Tensor* fc1_experts_bias_optional, - const Tensor* fc2_experts_bias_optional) const { + Status CheckInputs(MoEParameters& parameters, MoEQuantType& quant_type, const Tensor* input, + const Tensor* router_probs, const Tensor* fc1_experts_weights, + const Tensor* fc1_experts_bias_optional, const Tensor* fc2_experts_weights, + const Tensor* fc2_experts_bias_optional, const Tensor* fc3_experts_weights_optional, + const Tensor* fc3_experts_bias_optional) const { const auto& input_dims = input->Shape().GetDims(); const auto& router_probs_dims = router_probs->Shape().GetDims(); const auto& fc1_experts_weights_dims = fc1_experts_weights->Shape().GetDims(); @@ -45,7 +53,7 @@ class MoEBase { int64_t hidden_size = input_dims[input_dims.size() - 1]; int64_t local_num_experts = fc1_experts_weights_dims[0]; int64_t num_experts = router_probs_dims[1]; - int64_t inter_size = fc1_experts_weights_dims[2]; + int64_t inter_size = fc2_experts_weights_dims[1]; if (fc1_experts_weights_dims.size() != 3) { return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "fc1_experts_weights_dims must be 3D, got ", @@ -63,20 +71,21 @@ class MoEBase { if (fc2_experts_weights_dims[1] != inter_size) { return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "fc2_experts_weights_dims[1] must be equal to inter_size, got ", - fc2_experts_weights_dims[1], - " and ", inter_size); + fc2_experts_weights_dims[1], " and ", inter_size); } - if (fc1_experts_weights_dims[2] != inter_size) { + + const int64_t coe = quant_type == MoEQuantType::UINT4 ? 
2 : 1; + if (fc1_experts_weights_dims[2] != inter_size / coe) { return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "fc1_experts_weights_dims[2] must be equal to inter_size, got ", - fc1_experts_weights_dims[2], - " and ", inter_size); + fc1_experts_weights_dims[2], " and ", inter_size); } - if (fc2_experts_weights_dims[2] != hidden_size) { + if (fc2_experts_weights_dims[2] != hidden_size / coe) { return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "fc2_experts_weights_dims[2] must be equal to hidden_size, got ", fc2_experts_weights_dims[2], " and ", hidden_size); } + if (router_probs_dims.size() != 2) { return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "router_probs_dims must be 2D, got ", router_probs_dims.size()); @@ -85,12 +94,6 @@ class MoEBase { return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "router_probs_dims[0] must be equal to num_rows, got ", router_probs_dims[0], " and ", num_rows); } - if (fc1_experts_bias_optional != nullptr && fc2_experts_bias_optional == nullptr) { - return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "fc1_experts_bias is set but fc2_experts_bias is not set"); - } - if (fc1_experts_bias_optional == nullptr && fc2_experts_bias_optional != nullptr) { - return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "fc1_experts_bias is not set but fc2_experts_bias is set"); - } if (fc1_experts_bias_optional != nullptr && fc2_experts_bias_optional != nullptr) { const auto& fc1_experts_bias_dims = fc1_experts_bias_optional->Shape().GetDims(); const auto& fc2_experts_bias_dims = fc2_experts_bias_optional->Shape().GetDims(); @@ -105,42 +108,99 @@ class MoEBase { if (fc1_experts_bias_dims[0] != local_num_experts) { return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "fc1_experts_bias_dims[0] must be equal to local_num_experts, got ", - fc1_experts_bias_dims[0], - " and ", local_num_experts); + fc1_experts_bias_dims[0], " and ", local_num_experts); } if (fc2_experts_bias_dims[0] != num_experts) { return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, - "fc2_experts_bias_dims[0] must be equal to num_experts, got ", - fc2_experts_bias_dims[0], + "fc2_experts_bias_dims[0] must be equal to num_experts, got ", fc2_experts_bias_dims[0], " and ", num_experts); } if (fc1_experts_bias_dims[1] != inter_size) { return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, - "fc1_experts_bias_dims[1] must be equal to inter_size, got ", - fc1_experts_bias_dims[1], + "fc1_experts_bias_dims[1] must be equal to inter_size, got ", fc1_experts_bias_dims[1], " and ", inter_size); } if (fc2_experts_bias_dims[1] != hidden_size) { return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, - "fc2_experts_bias_dims[1] must be equal to hidden_size, got ", - fc2_experts_bias_dims[1], + "fc2_experts_bias_dims[1] must be equal to hidden_size, got ", fc2_experts_bias_dims[1], " and ", hidden_size); } } + if (fc3_experts_weights_optional != nullptr && + fc3_experts_weights_optional->Shape().GetDims() != fc1_experts_weights_dims) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, + "fc3_experts_weights_dims must be equal to fc1_experts_weights_dims, got ", + fc3_experts_weights_optional->Shape(), " and ", TensorShape(fc1_experts_weights_dims)); + } + + if (fc3_experts_bias_optional != nullptr && fc1_experts_bias_optional != nullptr && + fc3_experts_bias_optional->Shape().GetDims() != fc1_experts_bias_optional->Shape().GetDims()) { + return ORT_MAKE_STATUS( + ONNXRUNTIME, INVALID_ARGUMENT, "fc3_experts_bias_dims must be equal to fc1_experts_bias_dims, got ", + fc3_experts_bias_optional->Shape(), 
" and ", fc1_experts_bias_optional->Shape()); + } + parameters.num_rows = num_rows; parameters.num_experts = num_experts; parameters.local_num_experts = local_num_experts; parameters.hidden_size = hidden_size; parameters.inter_size = inter_size; if (num_experts == local_num_experts) { - parameters.parallel_type = MoEParallelType::None; + if (parameters.tensor_shards == 1) { + parameters.parallel_type = MoEParallelType::None; + } else { + parameters.parallel_type = MoEParallelType::TP; + } } else if (num_experts > local_num_experts) { - parameters.parallel_type = MoEParallelType::ExpertSlicing; + if (parameters.tensor_shards == 1) { + parameters.parallel_type = MoEParallelType::EP; + } else { + parameters.parallel_type = MoEParallelType::EPAndTP; + } } else { return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, - "num_experts must be greater than or equal to local_num_experts, got ", - num_experts, " and ", local_num_experts); + "num_experts must be greater than or equal to local_num_experts, got ", num_experts, + " and ", local_num_experts); + } + + return Status::OK(); + } + + Status CheckInputScales(const Tensor* fc1_experts_scales, const Tensor* fc2_experts_scales, + const Tensor* fc3_experts_scales, int64_t num_experts, int64_t hidden_size, + int64_t inter_size) const { + const auto& fc1_experts_scales_dims = fc1_experts_scales->Shape().GetDims(); + const auto& fc2_experts_scales_dims = fc2_experts_scales->Shape().GetDims(); + + if (fc1_experts_scales_dims.size() != 2) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "fc1_experts_scales must be 2D, got ", + fc1_experts_scales->Shape().GetDims().size()); + } + if (fc1_experts_scales_dims[0] != num_experts) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "fc1_experts_scales[0] must be equal to num_experts, got ", + fc1_experts_scales_dims[0], " and ", num_experts); + } + if (fc1_experts_scales_dims[1] != inter_size) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "fc1_experts_scales[1] must be equal to inter_size, got ", + fc1_experts_scales_dims[1], " and ", inter_size); + } + if (fc2_experts_scales_dims.size() != 2) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "fc2_experts_scales must be 2D, got ", + fc2_experts_scales->Shape().GetDims().size()); + } + if (fc2_experts_scales_dims[0] != num_experts) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "fc2_experts_scales[0] must be equal to num_experts, got ", + fc2_experts_scales_dims[0], " and ", num_experts); + } + if (fc2_experts_scales_dims[1] != hidden_size) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "fc2_experts_scales[1] must be equal to hidden_size, got ", + fc2_experts_scales_dims[1], " and ", hidden_size); + } + if (fc3_experts_scales != nullptr && fc1_experts_scales_dims != fc3_experts_scales->Shape().GetDims()) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, + "fc3_experts_scales must be equal to fc1_experts_scales, got ", + fc3_experts_scales->Shape(), " and ", TensorShape(fc1_experts_scales_dims)); } return Status::OK(); @@ -163,8 +223,11 @@ class MoEBase { } else { ORT_THROW("Unsupported MoE activation type: ", activation_type_str); } + + normalize_routing_weights_ = op_kernel_info.GetAttrOrDefault("normalize_routing_weights", 0) == 1; } + bool normalize_routing_weights_; int64_t k_; ort_fastertransformer::ActivationType activation_type_; }; @@ -172,5 +235,3 @@ class MoEBase { } // namespace cuda } // namespace contrib } // namespace onnxruntime - -#endif diff --git 
a/onnxruntime/contrib_ops/cuda/quantization/attention_quantization.cc b/onnxruntime/contrib_ops/cuda/quantization/attention_quantization.cc index 705f2d49fe2b..168c69c69f00 100644 --- a/onnxruntime/contrib_ops/cuda/quantization/attention_quantization.cc +++ b/onnxruntime/contrib_ops/cuda/quantization/attention_quantization.cc @@ -106,6 +106,8 @@ Status QAttention::ComputeInternal(OpKernelContext* context) const { const Tensor* past_tensor = context->Input(8); AttentionParameters parameters; + parameters.use_tf32 = UseTF32(); + ORT_RETURN_IF_ERROR(CheckInputs(input, weights, bias, @@ -152,7 +154,7 @@ Status QAttention::ComputeInternal(OpKernelContext* context) const { CudaT dequant_scale; CudaT input_scale = *(reinterpret_cast(input_scale_tensor->Data())); CudaT weight_scale = *(reinterpret_cast(weight_scale_tensor->Data())); - if (sizeof(T) == 2) { + if constexpr (sizeof(T) == 2) { dequant_scale = __float2half(__half2float(input_scale) * __half2float(weight_scale)); } else { dequant_scale = input_scale * weight_scale; diff --git a/onnxruntime/contrib_ops/cuda/quantization/dequantize_blockwise.cu b/onnxruntime/contrib_ops/cuda/quantization/dequantize_blockwise.cu index 6b66f1d84e22..265adf22eeb6 100644 --- a/onnxruntime/contrib_ops/cuda/quantization/dequantize_blockwise.cu +++ b/onnxruntime/contrib_ops/cuda/quantization/dequantize_blockwise.cu @@ -2,10 +2,12 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. +#include #include #include #include #include +#include #include #include "core/providers/cuda/cu_inc/common.cuh" #include "core/providers/cuda/cuda_common.h" @@ -21,7 +23,7 @@ namespace cuda { __device__ __forceinline__ void DequantizeEightElements(uint32_t values_quant, half scale, half zp, half* output) { half2 scale_half2 = {scale, scale}; - half zp_adjust = -scale * __short2half_rn(zp); + half zp_adjust = -scale * zp; half2 zp_adjust2 = {zp_adjust, zp_adjust}; alignas(16) half2 results[4]; @@ -56,41 +58,95 @@ __device__ __forceinline__ void DequantizeEightElements(uint32_t values_quant, f } template -__global__ void Dequantize4BitsKernel( +__global__ void Dequantize4BitsKernelReOrder( T* output, const uint8_t* quant_data, const T* scale_data, const uint8_t* zero_points, + const int32_t* reorder_idx, int block_size, - int blocks_per_K, - int blocks_per_threadblock, - int total_blks, - int shift) { - int block_id = blockIdx.x * blocks_per_threadblock + ((threadIdx.x * 8) >> shift); - if (block_id >= total_blks) { + int groups_per_K, + int groups_per_threadblock, + int total_groups) { + int group_id = blockIdx.x * groups_per_threadblock + ((threadIdx.x * 8) / block_size); + if (group_id >= total_groups) { return; } - int n_idx = block_id / blocks_per_K; - int kb_idx = block_id % blocks_per_K; - int element_offset = block_id * block_size + ((threadIdx.x * 8) & ((1 << shift) - 1)); + // T __shared__ zero_points_after_reorder[];//K + // T __shared__ scales_after_reorder[]; // K + // const int num_r_per_thread = k / 256; + + const int zero_point_shape_x = (groups_per_K + 1) / 2; + const int scales_shape_x = groups_per_K; + int n_idx = group_id / scales_shape_x; + int kb_idx = group_id % scales_shape_x; + int element_offset = group_id * block_size + ((threadIdx.x * 8) & (block_size - 1)); + T* output_i = output + element_offset; + uint32_t quant_value = *(reinterpret_cast(quant_data + element_offset / 2)); + const int32_t* reorder_idx_with_off = reorder_idx + kb_idx * block_size + ((threadIdx.x * 8) & (block_size - 1)); + for (int i = 0; i < 
8; i++) { + int32_t rid = reorder_idx_with_off[i]; + T scale = *(scale_data + n_idx * scales_shape_x + rid); + uint8_t zp = 8; + if (zero_points) { + zp = zero_points[n_idx * zero_point_shape_x + rid / 2]; + zp = (rid & 0x01) ? (zp >> 4) : (zp & 0x0f); + } + + if constexpr (std::is_same_v) { + T zp_adjust = -scale * __short2half_rn(zp); + output_i[i] = __uint2half_rn((quant_value >> (4 * i)) & 0xF) * scale + zp_adjust; + } else { + T zp_adjust = -scale * T(zp); + output_i[i] = T((quant_value >> (4 * i)) & 0xF) * scale + zp_adjust; + } + } +} + +template +__global__ void Dequantize4BitsKernel( + T* output, + const uint8_t* quant_data, + const T* scale_data, + const ZeroT* zero_points, + int block_size, + int groups_per_K, + int groups_per_threadblock, + int total_groups) { + int block_id = blockIdx.x * groups_per_threadblock + ((threadIdx.x * 8) / block_size); + if (block_id >= total_groups) { + return; + } + int element_offset = block_id * block_size + ((threadIdx.x * 8) & (block_size - 1)); uint32_t quant_value = *(reinterpret_cast(quant_data + element_offset / 2)); T scale = *(scale_data + block_id); - uint8_t zp = 8; - if (zero_points) { - zp = zero_points[n_idx * ((blocks_per_K + 1)/2) + kb_idx / 2]; - zp = (kb_idx & 0x01) ? (zp >> 4) : (zp & 0x0f); + T zero_point_value; + if constexpr (std::is_same_v) { + const int scales_shape_x = groups_per_K; + const int zero_point_shape_x = (groups_per_K + 1) / 2; + int kb_idx = block_id % scales_shape_x; + int n_idx = block_id / scales_shape_x; + uint8_t zp = 8; + if (zero_points) { + zp = zero_points[n_idx * zero_point_shape_x + kb_idx / 2]; + zp = (kb_idx & 0x01) ? (zp >> 4) : (zp & 0x0f); + } + zero_point_value = static_cast(zp); + } else { + zero_point_value = zero_points? *(zero_points + block_id):static_cast(8); } output = output + element_offset; - DequantizeEightElements(quant_value, scale, static_cast(zp), output); + DequantizeEightElements(quant_value, scale, zero_point_value, output); } -template +template Status Dequantize4Bits( T* output, const uint8_t* quant_data, const T* scales_data, - const uint8_t* zero_points, // shape: [N, (block_per_K + 1)/2] + const ZeroT* zero_points, // shape: [N, (block_per_K + 1)/2] + const int32_t* reorder_idx, int k, int n, int block_size, @@ -98,47 +154,79 @@ Status Dequantize4Bits( // k is padded and equal to block_per_K * block_size ORT_ENFORCE(k % block_size == 0, "k must be a multiplier of block_size"); constexpr int element_per_thread = 8; - int blocks_per_threadblock = GridDim::maxThreadsPerBlock * element_per_thread / block_size; - int blocks_per_K = k / block_size; - int total_blks = n * blocks_per_K; - int blocks_per_grid = static_cast(CeilDiv(n * blocks_per_K, blocks_per_threadblock)); - int shift = static_cast(log2f(float(block_size))); - - Dequantize4BitsKernel<<>>( - output, - quant_data, - scales_data, - zero_points, - block_size, - blocks_per_K, - blocks_per_threadblock, - total_blks, - shift); + int groups_per_threadblock = GridDim::maxThreadsPerBlock * element_per_thread / block_size; + int groups_per_K = k / block_size; + int total_groups = n * groups_per_K; // total elemenets in quant_data + int groups_per_grid = static_cast(CeilDiv(total_groups, groups_per_threadblock)); + if (!reorder_idx || std::is_same_v) { + Dequantize4BitsKernel<<>>( + output, + quant_data, + scales_data, + zero_points, + block_size, + groups_per_K, + groups_per_threadblock, + total_groups); + } else { + // static_assert(std::is_same_v, "ZeroT must be uint8_t"); + Dequantize4BitsKernelReOrder<<>>( + output, + 
quant_data, + scales_data, + (const uint8_t*)zero_points, + reorder_idx, + block_size, + groups_per_K, + groups_per_threadblock, + total_groups); + } return Status::OK(); } -template Status Dequantize4Bits( +template Status Dequantize4Bits( float* output, const uint8_t* quant_data, const float* scales_data, const uint8_t* zero_points, + const int32_t* reorder_idx, int k, int n, int block_size, cudaStream_t stream); -template Status Dequantize4Bits( +template Status Dequantize4Bits( half* output, const uint8_t* quant_data, const half* scales_data, const uint8_t* zero_points, + const int32_t* reorder_idx, + int k, + int n, + int block_size, + cudaStream_t stream); +template Status Dequantize4Bits( + float* output, + const uint8_t* quant_data, + const float* scales_data, + const float* zero_points, + const int32_t* reorder_idx, int k, int n, int block_size, cudaStream_t stream); - +template Status Dequantize4Bits( + half* output, + const uint8_t* quant_data, + const half* scales_data, + const half* zero_points, + const int32_t* reorder_idx, + int k, + int n, + int block_size, + cudaStream_t stream); /////////////////////////////////////////////////////////////////////////////// // A more general block-wise dequantization implementation that supports // different block sizes and block orientations (row-wise/column-wise). diff --git a/onnxruntime/contrib_ops/cuda/quantization/dequantize_blockwise.cuh b/onnxruntime/contrib_ops/cuda/quantization/dequantize_blockwise.cuh index f9c09c55fd89..580b5087f3fa 100644 --- a/onnxruntime/contrib_ops/cuda/quantization/dequantize_blockwise.cuh +++ b/onnxruntime/contrib_ops/cuda/quantization/dequantize_blockwise.cuh @@ -7,18 +7,18 @@ namespace onnxruntime { namespace contrib { namespace cuda { -template +template Status Dequantize4Bits( T* output, const uint8_t* quant_data, const T* scales_data, - const uint8_t* zero_points, + const ZeroT* zero_points, + const int32_t* reorder_idx, int k, int n, int block_size, cudaStream_t stream); - /** * @brief Dequantize a block-wise quantized matrix, and store the result in a * column major matrix for use in subsequent GEMM. This implementation supports diff --git a/onnxruntime/contrib_ops/cuda/quantization/matmul_bnb4.cc b/onnxruntime/contrib_ops/cuda/quantization/matmul_bnb4.cc index bbcb7de99781..0534ed6dc7fc 100644 --- a/onnxruntime/contrib_ops/cuda/quantization/matmul_bnb4.cc +++ b/onnxruntime/contrib_ops/cuda/quantization/matmul_bnb4.cc @@ -117,7 +117,8 @@ Status MatMulBnb4::ComputeInternal(OpKernelContext* ctx) const { &zero, reinterpret_cast(Y->MutableData()), helper.Ldc(), - GetDeviceProp())); + GetDeviceProp(), + UseTF32())); } return Status::OK(); diff --git a/onnxruntime/contrib_ops/cuda/quantization/matmul_nbits.cc b/onnxruntime/contrib_ops/cuda/quantization/matmul_nbits.cc index 5b0e61e19701..1cec6f6a12f1 100644 --- a/onnxruntime/contrib_ops/cuda/quantization/matmul_nbits.cc +++ b/onnxruntime/contrib_ops/cuda/quantization/matmul_nbits.cc @@ -1,15 +1,12 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. 
-// -// This module define MatMulFp32Q4 operator, it is basically -// matmul float32 with right hand side being a 2-D matrix -// pre-packed and block-compacted into int4 -// - -#include "core/common/safeint.h" -#include "core/providers/cuda/cuda_kernel.h" -#include "core/providers/cuda/shared_inc/fpgeneric.h" +#include "contrib_ops/cuda/quantization/matmul_nbits.h" + +#include + +#include "core/common/status.h" +#include "core/framework/float16.h" #include "core/providers/cpu/math/matmul_helper.h" #include "matmul_nbits.cuh" #include "dequantize_blockwise.cuh" @@ -19,40 +16,19 @@ namespace contrib { namespace cuda { using namespace onnxruntime::cuda; -template -class MatMulNBits final : public CudaKernel { - public: - MatMulNBits(const OpKernelInfo& info) : CudaKernel(info) { - ORT_ENFORCE(Status::OK() == info.GetAttr("K", &K_)); - ORT_ENFORCE(Status::OK() == info.GetAttr("N", &N_)); - ORT_ENFORCE(Status::OK() == info.GetAttr("block_size", &block_size_)); - ORT_ENFORCE(Status::OK() == info.GetAttr("bits", &nbits_)); - ORT_ENFORCE(nbits_ == 4, - "Only 4b quantization is supported for MatMulNBits op," - " additional bits support is planned."); - } - - Status ComputeInternal(OpKernelContext* context) const override; - - private: - int64_t K_; - int64_t N_; - int64_t block_size_; - int64_t nbits_; - bool column_wise_quant_blk_{true}; -}; - template Status MatMulNBits::ComputeInternal(OpKernelContext* ctx) const { const Tensor* a = ctx->Input(0); const Tensor* b = ctx->Input(1); const Tensor* scales = ctx->Input(2); const Tensor* zero_points = ctx->Input(3); + const Tensor* reorder_idx = ctx->Input(4); const auto* a_data = a->Data(); const uint8_t* blob_data = b->Data(); const auto* scales_data = scales->Data(); - const auto* zero_points_data = zero_points == nullptr ? nullptr : zero_points->Data(); + const auto* zero_points_data = zero_points == nullptr ? nullptr : zero_points->DataRaw(); + const auto* reorder_idx_data = reorder_idx == nullptr ? 
nullptr : reorder_idx->Data(); typedef typename ToCudaType::MappedType CudaT; @@ -67,76 +43,99 @@ Status MatMulNBits::ComputeInternal(OpKernelContext* ctx) const { // Bail out early if the output is going to be empty if (Y->Shape().Size() == 0) return Status::OK(); - bool is_4bit_done = TryMatMul4Bits( - reinterpret_cast(Y->MutableData()), - reinterpret_cast(a_data), - blob_data, - reinterpret_cast(scales_data), - zero_points_data, - SafeInt(helper.M()), - SafeInt(helper.N()), - SafeInt(helper.K()), - SafeInt(block_size_), - SafeInt(GetDeviceProp().sharedMemPerBlock), - static_cast(ctx->GetComputeStream()->GetHandle())); - if (!is_4bit_done) { - int64_t K_padded = (K_ + block_size_ - 1) / block_size_ * block_size_; - IAllocatorUniquePtr b_data_ptr = GetScratchBuffer(N_ * K_padded, ctx->GetComputeStream()); - auto* b_data = b_data_ptr.get(); - if (column_wise_quant_blk_) { - // column-wise block + bool is_4bit_done = (reorder_idx_data == nullptr) && + (!zero_points || !zero_points->IsDataType()) && + TryMatMul4Bits( + reinterpret_cast(Y->MutableData()), + reinterpret_cast(a_data), + blob_data, + reinterpret_cast(scales_data), + static_cast(zero_points_data), + SafeInt(helper.M()), + SafeInt(helper.N()), + SafeInt(helper.K()), + SafeInt(block_size_), + SafeInt(GetDeviceProp().sharedMemPerBlock), + static_cast(ctx->GetComputeStream()->GetHandle())); + + if (is_4bit_done) { + return Status::OK(); + } + + int64_t K_padded = (K_ + block_size_ - 1) / block_size_ * block_size_; + IAllocatorUniquePtr b_data_ptr = GetScratchBuffer(N_ * K_padded, ctx->GetComputeStream()); + auto* b_data = b_data_ptr.get(); + if (column_wise_quant_blk_) { + if (reorder_idx) { + ORT_ENFORCE(K_padded == reorder_idx->Shape()[0], "K_padded != g_idx->Shape()[0]"); + } + // column-wise block + if ((zero_points && zero_points->IsDataType())) { ORT_RETURN_IF_ERROR(Dequantize4Bits( reinterpret_cast(b_data), blob_data, reinterpret_cast(scales_data), - zero_points_data, + (const CudaT*)zero_points_data, + reorder_idx_data, SafeInt(K_padded), SafeInt(N_), SafeInt(block_size_), static_cast(ctx->GetComputeStream()->GetHandle()))); } else { - // row-wise block - K_padded = K_; - - ORT_RETURN_IF_ERROR(DequantizeBlockwise4b( + ORT_RETURN_IF_ERROR(Dequantize4Bits( reinterpret_cast(b_data), blob_data, reinterpret_cast(scales_data), - zero_points_data, - SafeInt(block_size_), - column_wise_quant_blk_, - SafeInt(K_), + (const uint8_t*)zero_points_data, + reorder_idx_data, + SafeInt(K_padded), SafeInt(N_), + SafeInt(block_size_), static_cast(ctx->GetComputeStream()->GetHandle()))); } + } else { + // row-wise block + K_padded = K_; + + ORT_RETURN_IF_ERROR(DequantizeBlockwise4b( + reinterpret_cast(b_data), + blob_data, + reinterpret_cast(scales_data), + (const uint8_t*)zero_points_data, + SafeInt(block_size_), + column_wise_quant_blk_, + SafeInt(K_), + SafeInt(N_), + static_cast(ctx->GetComputeStream()->GetHandle()))); + } #if 0 - cudaStreamSynchronize(static_cast(ctx->GetComputeStream()->GetHandle())); - T* b_data_cpu = new T[K_ * N_]; - cudaMemcpy(b_data_cpu, b_data, K_ * N_ * sizeof(T), cudaMemcpyDeviceToHost); - delete[] b_data_cpu; +cudaStreamSynchronize(static_cast(ctx->GetComputeStream()->GetHandle())); +T* b_data_cpu = new T[K_ * N_]; +cudaMemcpy(b_data_cpu, b_data, K_ * N_ * sizeof(T), cudaMemcpyDeviceToHost); +delete[] b_data_cpu; #endif - const CudaT alpha = ToCudaType::FromFloat(1.f); - const CudaT zero = ToCudaType::FromFloat(0.f); - - if (helper.OutputOffsets().size() == 1) { - CUBLAS_RETURN_IF_ERROR(cublasGemmHelper( - 
GetCublasHandle(ctx), - CUBLAS_OP_T, - CUBLAS_OP_N, - SafeInt(helper.N()), - SafeInt(helper.M()), - SafeInt(helper.K()), - &alpha, - reinterpret_cast(b_data), - SafeInt(K_padded), - reinterpret_cast(a_data), - helper.Lda(transa), - &zero, - reinterpret_cast(Y->MutableData()), - helper.Ldc(), - GetDeviceProp())); - } + const CudaT alpha = ToCudaType::FromFloat(1.f); + const CudaT zero = ToCudaType::FromFloat(0.f); + + if (helper.OutputOffsets().size() == 1) { + CUBLAS_RETURN_IF_ERROR(cublasGemmHelper( + GetCublasHandle(ctx), + CUBLAS_OP_T, + CUBLAS_OP_N, + SafeInt(helper.N()), + SafeInt(helper.M()), + SafeInt(helper.K()), + &alpha, + reinterpret_cast(b_data), + SafeInt(K_padded), + reinterpret_cast(a_data), + helper.Lda(transa), + &zero, + reinterpret_cast(Y->MutableData()), + helper.Ldc(), + GetDeviceProp(), + UseTF32())); } return Status::OK(); diff --git a/onnxruntime/contrib_ops/cuda/quantization/matmul_nbits.cu b/onnxruntime/contrib_ops/cuda/quantization/matmul_nbits.cu index 67384957d8dd..d4d583906b7f 100644 --- a/onnxruntime/contrib_ops/cuda/quantization/matmul_nbits.cu +++ b/onnxruntime/contrib_ops/cuda/quantization/matmul_nbits.cu @@ -89,7 +89,7 @@ __device__ __forceinline__ void Convert8xInt4To8xHalfs(uint32_t value, half2* ha asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(h[3]) : "r"(h[3]), "r"(kOneSixteenth), "r"(kNeg64)); } -__device__ __forceinline__ float AccumulateEightElements(uint32_t values_quant, half scale, uint8_t zp, const half* a, half* sums) { +__device__ __forceinline__ void AccumulateEightElements(uint32_t values_quant, half scale, uint8_t zp, const half* a, half* sums) { half2 scale_half2 = {scale, scale}; half zp_adjust = -scale * __short2half_rn(zp); half2 zp_adjust2 = {zp_adjust, zp_adjust}; @@ -120,7 +120,7 @@ __device__ __forceinline__ float AccumulateEightElements(uint32_t values_quant, sums_half2[3] = sums_half2[3] + v3 * (*(reinterpret_cast(&(vec_permuted.w)))); } #else -__device__ __forceinline__ float AccumulateEightElements(uint32_t values_quant, half scale, uint8_t zp, const half* a, half* sums) { +__device__ __forceinline__ void AccumulateEightElements(uint32_t values_quant, half scale, uint8_t zp, const half* a, half* sums) { half2 scale_half2 = {scale, scale}; half zp_adjust = -scale * __short2half_rn(zp); half2 zp_adjust2 = {zp_adjust, zp_adjust}; @@ -144,7 +144,7 @@ __device__ __forceinline__ float AccumulateEightElements(uint32_t values_quant, } #endif -__device__ __forceinline__ float AccumulateEightElements(uint32_t values_quant, float scale, uint8_t zp, const float* a, float* sums) { +__device__ __forceinline__ void AccumulateEightElements(uint32_t values_quant, float scale, uint8_t zp, const float* a, float* sums) { float4 a_vec_0 = *(reinterpret_cast(a)); float4 a_vec_1 = *(reinterpret_cast(a + 4)); diff --git a/onnxruntime/contrib_ops/cuda/quantization/matmul_nbits.h b/onnxruntime/contrib_ops/cuda/quantization/matmul_nbits.h new file mode 100644 index 000000000000..f5c2c6c4e4fd --- /dev/null +++ b/onnxruntime/contrib_ops/cuda/quantization/matmul_nbits.h @@ -0,0 +1,41 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
+ +// +// This module defines the MatMulNBits operator: it is basically +// a float matmul whose right-hand side is a 2-D matrix +// pre-packed and block-compacted into int4 +// +#pragma once +#include "core/common/safeint.h" +#include "core/providers/cuda/cuda_kernel.h" +#include "core/providers/cuda/shared_inc/fpgeneric.h" + +namespace onnxruntime { +namespace contrib { +namespace cuda { +using namespace onnxruntime::cuda; + +template +class MatMulNBits final : public CudaKernel { + public: + MatMulNBits(const OpKernelInfo& info) : CudaKernel(info) { + ORT_ENFORCE(Status::OK() == info.GetAttr("K", &K_)); + ORT_ENFORCE(Status::OK() == info.GetAttr("N", &N_)); + ORT_ENFORCE(Status::OK() == info.GetAttr("block_size", &block_size_)); + ORT_ENFORCE(Status::OK() == info.GetAttr("bits", &nbits_)); + } + + Status ComputeInternal(OpKernelContext* context) const override; + + private: + int64_t K_; + int64_t N_; + int64_t block_size_; + int64_t nbits_; + bool column_wise_quant_blk_{true}; +}; + +} // namespace cuda +} // namespace contrib +} // namespace onnxruntime diff --git a/onnxruntime/contrib_ops/cuda/quantization/moe_quantization.cc b/onnxruntime/contrib_ops/cuda/quantization/moe_quantization.cc new file mode 100644 index 000000000000..7bb0945615d3 --- /dev/null +++ b/onnxruntime/contrib_ops/cuda/quantization/moe_quantization.cc @@ -0,0 +1,143 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include +#include "core/common/safeint.h" +#include "core/providers/cuda/cuda_common.h" +#include "contrib_ops/cuda/quantization/moe_quantization.h" + +using namespace onnxruntime::cuda; +using namespace ::onnxruntime::common; +using namespace ONNX_NAMESPACE; + +namespace onnxruntime { +namespace contrib { +namespace cuda { + +#define REGISTER_KERNEL() \ + ONNX_OPERATOR_KERNEL_EX(QMoE, kMSDomain, 1, kCudaExecutionProvider, \ + (*KernelDefBuilder::Create()) \ + .MayInplace(0, 0) \ + .TypeConstraint("T", BuildKernelDefConstraints()) \ + .TypeConstraint("T1", BuildKernelDefConstraints()), \ + QMoE); + +REGISTER_KERNEL() + +namespace { +template +struct ToCudaTypeWrapper : public ToCudaType {}; + +template <> +struct ToCudaTypeWrapper { + using MappedType = uint8_t; +}; + +template <> +struct ToCudaTypeWrapper { + using MappedType = cutlass::uint4b_t; +}; +} // anonymous namespace + +QMoE::QMoE(const OpKernelInfo& op_kernel_info) : CudaKernel(op_kernel_info), MoEBase(op_kernel_info) {} + +Status QMoE::ComputeInternal(OpKernelContext* context) const { + const Tensor* input = context->Input(0); + const Tensor* router_probs = context->Input(1); + const Tensor* fc1_experts_weights = context->Input(2); + const Tensor* fc1_scales = context->Input(3); + const Tensor* fc1_experts_bias_optional = context->Input(4); + const Tensor* fc2_experts_weights = context->Input(5); + const Tensor* fc2_scales = context->Input(6); + const Tensor* fc2_experts_bias_optional = context->Input(7); + const Tensor* fc3_experts_weights_optional = context->Input(8); + const Tensor* fc3_scales_optional = context->Input(9); + const Tensor* fc3_experts_bias_optional = context->Input(10); + + MoEParameters moe_params; + MoEQuantType quant_type = MoEQuantType::UINT4; + ORT_RETURN_IF_ERROR(CheckInputs(moe_params, quant_type, input, router_probs, fc1_experts_weights, + fc1_experts_bias_optional, fc2_experts_weights, fc2_experts_bias_optional, + fc3_experts_weights_optional, fc3_experts_bias_optional)); + ORT_RETURN_IF_ERROR(CheckInputScales(fc1_scales, fc2_scales, fc3_scales_optional,
moe_params.num_experts, + moe_params.hidden_size, moe_params.inter_size)); + + // Support int4 only at the moment. We can add uint8 if needed. + static constexpr bool use_quint4x2 = true; + using T = MLFloat16; + using CudaT = typename ToCudaType::MappedType; + using CudaWeightT = typename ToCudaTypeWrapper::MappedType; + + auto stream = context->GetComputeStream(); + + auto& device_prop = GetDeviceProp(); + const int sm = device_prop.major * 10 + device_prop.minor; + + ort_fastertransformer::CutlassMoeFCRunner moe_runner(sm, fc3_experts_weights_optional != nullptr, + normalize_routing_weights_); + + size_t ws_size = moe_runner.getWorkspaceSize( + static_cast(moe_params.num_rows), static_cast(moe_params.hidden_size), + static_cast(moe_params.inter_size), static_cast(moe_params.num_experts), static_cast(k_)); + size_t fc2_output_size = k_ * moe_params.num_rows * moe_params.hidden_size * sizeof(CudaT); + size_t expert_scales_size = k_ * moe_params.num_rows * sizeof(CudaT); + size_t expanded_source_row_to_expanded_dest_row_size = k_ * moe_params.num_rows * sizeof(int); + size_t expert_for_source_row_size = k_ * moe_params.num_rows * sizeof(int); + + AllocatorPtr allocator; + ORT_RETURN_IF_ERROR(context->GetTempSpaceAllocator(&allocator)); + + IAllocatorUniquePtr work_space = IAllocator::MakeUniquePtr(allocator, ws_size, false, stream); + IAllocatorUniquePtr fc2_output = IAllocator::MakeUniquePtr(allocator, fc2_output_size, false, stream); + IAllocatorUniquePtr expert_scales = + IAllocator::MakeUniquePtr(allocator, expert_scales_size, false, stream); + IAllocatorUniquePtr expanded_source_row_to_expanded_dest_row = + IAllocator::MakeUniquePtr(allocator, expanded_source_row_to_expanded_dest_row_size, false, stream); + IAllocatorUniquePtr expert_for_source_row = + IAllocator::MakeUniquePtr(allocator, expert_for_source_row_size, false, stream); + + moe_runner.run_moe_fc( + reinterpret_cast(input->template Data()), + reinterpret_cast(router_probs->template Data()), + reinterpret_cast(fc1_experts_weights->DataRaw()), + fc1_scales == nullptr ? nullptr : reinterpret_cast(fc1_scales->template Data()), + fc1_experts_bias_optional == nullptr + ? nullptr + : reinterpret_cast(fc1_experts_bias_optional->template Data()), + activation_type_, + fc3_experts_weights_optional == nullptr + ? nullptr + : reinterpret_cast(fc3_experts_weights_optional->DataRaw()), + fc3_scales_optional == nullptr ? nullptr + : reinterpret_cast(fc3_scales_optional->template Data()), + fc3_experts_bias_optional == nullptr + ? nullptr + : reinterpret_cast(fc3_experts_bias_optional->template Data()), + reinterpret_cast(fc2_experts_weights->DataRaw()), + fc2_scales == nullptr ? nullptr : reinterpret_cast(fc2_scales->template Data()), + static_cast(moe_params.num_rows), static_cast(moe_params.hidden_size), + static_cast(moe_params.inter_size), static_cast(moe_params.num_experts), + static_cast(moe_params.local_num_experts), 0 /*local_experts_start_index_ used in sharded MoE*/, + static_cast(k_), reinterpret_cast(work_space.get()), reinterpret_cast(fc2_output.get()), + reinterpret_cast(expert_scales.get()), + reinterpret_cast(expanded_source_row_to_expanded_dest_row.get()), + reinterpret_cast(expert_for_source_row.get()), Stream(context)); + + Tensor* output = context->Output(0, input->Shape()); + + ort_fastertransformer::finalize_moe_routing_kernelLauncher( + reinterpret_cast(fc2_output.get()), reinterpret_cast(output->template MutableData()), + fc2_experts_bias_optional == nullptr + ? 
nullptr + : reinterpret_cast(fc2_experts_bias_optional->template Data()), + reinterpret_cast(expert_scales.get()), + reinterpret_cast(expanded_source_row_to_expanded_dest_row.get()), + reinterpret_cast(expert_for_source_row.get()), static_cast(moe_params.num_rows), + static_cast(moe_params.hidden_size), static_cast(k_), Stream(context)); + + return Status::OK(); +} + +} // namespace cuda +} // namespace contrib +} // namespace onnxruntime diff --git a/onnxruntime/contrib_ops/cuda/quantization/moe_quantization.h b/onnxruntime/contrib_ops/cuda/quantization/moe_quantization.h new file mode 100644 index 000000000000..7b68d2d082de --- /dev/null +++ b/onnxruntime/contrib_ops/cuda/quantization/moe_quantization.h @@ -0,0 +1,25 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once + +#include "contrib_ops/cuda/moe/ft_moe/moe_kernel.h" +#include "contrib_ops/cuda/moe/moe_base.h" +#include "core/common/common.h" +#include "core/providers/cuda/cuda_kernel.h" + +namespace onnxruntime { +namespace contrib { +namespace cuda { + +using namespace onnxruntime::cuda; + +class QMoE final : public CudaKernel, public MoEBase { + public: + explicit QMoE(const OpKernelInfo& op_kernel_info); + Status ComputeInternal(OpKernelContext* ctx) const override; +}; + +} // namespace cuda +} // namespace contrib +} // namespace onnxruntime diff --git a/onnxruntime/contrib_ops/cuda/quantization/qordered_ops/qordered_attention.cc b/onnxruntime/contrib_ops/cuda/quantization/qordered_ops/qordered_attention.cc index 3cecebedae2f..12835978536e 100644 --- a/onnxruntime/contrib_ops/cuda/quantization/qordered_ops/qordered_attention.cc +++ b/onnxruntime/contrib_ops/cuda/quantization/qordered_ops/qordered_attention.cc @@ -142,7 +142,7 @@ inline void debug_print([[maybe_unused]] const T* arr, std::cout << "========" << name << std::endl; for (size_t i = 0; i < sz; i++) { if (i % w == 0) std::cout << std::endl; - if (std::is_same().value) { + if constexpr (std::is_same::value) { std::cout << (int)buf[i] << ", "; } else { std::cout << buf[i] << ", "; diff --git a/onnxruntime/contrib_ops/cuda/quantization/qordered_ops/qordered_attention_impl.cu b/onnxruntime/contrib_ops/cuda/quantization/qordered_ops/qordered_attention_impl.cu index f4d5a7b404a6..fd4b51f40fb4 100644 --- a/onnxruntime/contrib_ops/cuda/quantization/qordered_ops/qordered_attention_impl.cu +++ b/onnxruntime/contrib_ops/cuda/quantization/qordered_ops/qordered_attention_impl.cu @@ -151,7 +151,7 @@ QOrderBatchInt8MatrixTransposeKernel(const int8_t* src, const int8_t* dst, const } } -Status QOrderBatchTransposeInt8Matrix(cudaStream_t stream, const cudaDeviceProp& device_prop, +Status QOrderBatchTransposeInt8Matrix(cudaStream_t stream, const cudaDeviceProp& /*device_prop*/, const int batch_size, const int rows, const int cols, const int8_t* input, int8_t* output) { ORT_ENFORCE(rows % 4 == 0 && cols % 4 == 0, "Matrix rows and cols must be divisible by 4!"); diff --git a/onnxruntime/contrib_ops/cuda/quantization/qordered_ops/qordered_qdq_impl.cu b/onnxruntime/contrib_ops/cuda/quantization/qordered_ops/qordered_qdq_impl.cu index baff8e76ec73..e6ac0bc8a517 100644 --- a/onnxruntime/contrib_ops/cuda/quantization/qordered_ops/qordered_qdq_impl.cu +++ b/onnxruntime/contrib_ops/cuda/quantization/qordered_ops/qordered_qdq_impl.cu @@ -389,7 +389,7 @@ QOrderDequantizeKernel_Strict(const int8_t* __restrict__ src, const __half* __re } } -Status QOrderDequantize_Strict(cudaStream_t stream, const cudaDeviceProp& device_prop, +Status
QOrderDequantize_Strict(cudaStream_t stream, const cudaDeviceProp& /*device_prop*/, const int8_t* src, __half* dst, float scale, size_t N) { ORT_RETURN_IF(N & 0x3LL, "N can not divide by 4!"); diff --git a/onnxruntime/contrib_ops/cuda/transformers/beam_search.cc b/onnxruntime/contrib_ops/cuda/transformers/beam_search.cc index 2a90e4911f28..08cbb145a6f6 100644 --- a/onnxruntime/contrib_ops/cuda/transformers/beam_search.cc +++ b/onnxruntime/contrib_ops/cuda/transformers/beam_search.cc @@ -49,6 +49,7 @@ ONNX_OPERATOR_KERNEL_EX( .InputMemoryType(OrtMemTypeCPUInput, 9) // 'attention_mask' needs to be on CPU .InputMemoryType(OrtMemTypeCPUInput, 10) // 'decoder_input_ids' needs to be on CPU .InputMemoryType(OrtMemTypeCPUInput, 11) // 'logits_processor' needs to be on CPU + .InputMemoryType(OrtMemTypeCPUInput, 14) // 'temperature' needs to be on CPU .OutputMemoryType(OrtMemTypeCPUOutput, 0) // 'sequences' output on CPU .OutputMemoryType(OrtMemTypeCPUOutput, 1) // 'sequences_scores' output on CPU .TypeConstraint("T", {DataTypeImpl::GetTensorType(), diff --git a/onnxruntime/contrib_ops/cuda/transformers/dump_cuda_tensor.cc b/onnxruntime/contrib_ops/cuda/transformers/dump_cuda_tensor.cc index b31f5d243e00..4cfa89a4d58c 100644 --- a/onnxruntime/contrib_ops/cuda/transformers/dump_cuda_tensor.cc +++ b/onnxruntime/contrib_ops/cuda/transformers/dump_cuda_tensor.cc @@ -203,23 +203,19 @@ void DumpGpuTensor(const char* name, const Tensor& tensor) { DumpGpuTensor(nullptr, tensor, static_cast(num_rows), static_cast(row_size)); } -void CudaTensorConsoleDumper::Print(const char* name, const float* tensor, int dim0, int dim1) const { +void CudaTensorConsoleDumper::Print(const char* name, const size_t* tensor, int dim0, int dim1) const { if (is_enabled_) - DumpGpuTensor(name, tensor, dim0, dim1, true); + DumpGpuTensor(name, tensor, dim0, dim1, true); } -void CudaTensorConsoleDumper::Print(const char* name, const MLFloat16* tensor, int dim0, int dim1) const { +void CudaTensorConsoleDumper::Print(const char* name, const int32_t* tensor, int dim0, int dim1) const { if (is_enabled_) - DumpGpuTensor(name, tensor, dim0, dim1, true); + DumpGpuTensor(name, tensor, dim0, dim1, true); } -void CudaTensorConsoleDumper::Print(const char* name, const size_t* tensor, int dim0, int dim1) const { +void CudaTensorConsoleDumper::Print(const char* name, const int32_t* tensor, int dim0, int dim1, int dim2) const { if (is_enabled_) - DumpGpuTensor(name, tensor, dim0, dim1, true); -} - -void CudaTensorConsoleDumper::Print(const char* name, const half* tensor, int dim0, int dim1) const { - Print(name, reinterpret_cast(tensor), dim0, dim1); + DumpGpuTensor(name, tensor, dim0, dim1, dim2, true); } void CudaTensorConsoleDumper::Print(const char* name, const int64_t* tensor, int dim0, int dim1) const { @@ -227,9 +223,14 @@ void CudaTensorConsoleDumper::Print(const char* name, const int64_t* tensor, int DumpGpuTensor(name, tensor, dim0, dim1, true); } -void CudaTensorConsoleDumper::Print(const char* name, const int32_t* tensor, int dim0, int dim1) const { +void CudaTensorConsoleDumper::Print(const char* name, const int64_t* tensor, int dim0, int dim1, int dim2) const { if (is_enabled_) - DumpGpuTensor(name, tensor, dim0, dim1, true); + DumpGpuTensor(name, tensor, dim0, dim1, dim2, true); +} + +void CudaTensorConsoleDumper::Print(const char* name, const float* tensor, int dim0, int dim1) const { + if (is_enabled_) + DumpGpuTensor(name, tensor, dim0, dim1, true); } void CudaTensorConsoleDumper::Print(const char* name, const float* tensor, int 
dim0, int dim1, int dim2) const { @@ -242,6 +243,11 @@ void CudaTensorConsoleDumper::Print(const char* name, const float* tensor, int d DumpGpuTensor(name, tensor, dim0, dim1, dim2, dim3, true); } +void CudaTensorConsoleDumper::Print(const char* name, const MLFloat16* tensor, int dim0, int dim1) const { + if (is_enabled_) + DumpGpuTensor(name, tensor, dim0, dim1, true); +} + void CudaTensorConsoleDumper::Print(const char* name, const MLFloat16* tensor, int dim0, int dim1, int dim2) const { if (is_enabled_) DumpGpuTensor(name, tensor, dim0, dim1, dim2, true); @@ -252,22 +258,31 @@ void CudaTensorConsoleDumper::Print(const char* name, const MLFloat16* tensor, i DumpGpuTensor(name, tensor, dim0, dim1, dim2, dim3, true); } -void CudaTensorConsoleDumper::Print(const char* name, const half* tensor, int dim0, int dim1, int dim2) const { - Print(name, reinterpret_cast(tensor), dim0, dim1, dim2); +void CudaTensorConsoleDumper::Print(const char* name, const BFloat16* tensor, int dim0, int dim1) const { + if (is_enabled_) + DumpGpuTensor(name, tensor, dim0, dim1, true); } -void CudaTensorConsoleDumper::Print(const char* name, const half* tensor, int dim0, int dim1, int dim2, int dim3) const { - Print(name, reinterpret_cast(tensor), dim0, dim1, dim2, dim3); +void CudaTensorConsoleDumper::Print(const char* name, const BFloat16* tensor, int dim0, int dim1, int dim2) const { + if (is_enabled_) + DumpGpuTensor(name, tensor, dim0, dim1, dim2, true); } -void CudaTensorConsoleDumper::Print(const char* name, const int64_t* tensor, int dim0, int dim1, int dim2) const { +void CudaTensorConsoleDumper::Print(const char* name, const BFloat16* tensor, int dim0, int dim1, int dim2, int dim3) const { if (is_enabled_) - DumpGpuTensor(name, tensor, dim0, dim1, dim2, true); + DumpGpuTensor(name, tensor, dim0, dim1, dim2, dim3, true); } -void CudaTensorConsoleDumper::Print(const char* name, const int32_t* tensor, int dim0, int dim1, int dim2) const { - if (is_enabled_) - DumpGpuTensor(name, tensor, dim0, dim1, dim2, true); +void CudaTensorConsoleDumper::Print(const char* name, const half* tensor, int dim0, int dim1) const { + Print(name, reinterpret_cast(tensor), dim0, dim1); +} + +void CudaTensorConsoleDumper::Print(const char* name, const half* tensor, int dim0, int dim1, int dim2) const { + Print(name, reinterpret_cast(tensor), dim0, dim1, dim2); +} + +void CudaTensorConsoleDumper::Print(const char* name, const half* tensor, int dim0, int dim1, int dim2, int dim3) const { + Print(name, reinterpret_cast(tensor), dim0, dim1, dim2, dim3); } void CudaTensorConsoleDumper::Print(const char* name, const Tensor& tensor) const { @@ -301,43 +316,52 @@ void CudaTensorConsoleDumper::Print(const char* name, const std::string& value, } #else -void CudaTensorConsoleDumper::Print(const char*, const float*, int, int) const { +void CudaTensorConsoleDumper::Print(const char*, const size_t*, int, int) const { } -void CudaTensorConsoleDumper::Print(const char*, const MLFloat16*, int, int) const { +void CudaTensorConsoleDumper::Print(const char*, const int32_t*, int, int) const { } -void CudaTensorConsoleDumper::Print(const char*, const size_t*, int, int) const { +void CudaTensorConsoleDumper::Print(const char*, const int32_t*, int, int, int) const { } -void CudaTensorConsoleDumper::Print(const char*, const half*, int, int) const { +void CudaTensorConsoleDumper::Print(const char*, const int64_t*, int, int) const { } -void CudaTensorConsoleDumper::Print(const char*, const int64_t*, int, int) const { +void 
CudaTensorConsoleDumper::Print(const char*, const int64_t*, int, int, int) const { } -void CudaTensorConsoleDumper::Print(const char*, const int32_t*, int, int) const { +void CudaTensorConsoleDumper::Print(const char*, const float*, int, int) const { } void CudaTensorConsoleDumper::Print(const char*, const float*, int, int, int) const { } +void CudaTensorConsoleDumper::Print(const char*, const float*, int, int, int, int) const { +} + +void CudaTensorConsoleDumper::Print(const char*, const MLFloat16*, int, int) const { +} + void CudaTensorConsoleDumper::Print(const char*, const MLFloat16*, int, int, int) const { } -void CudaTensorConsoleDumper::Print(const char*, const half*, int, int, int) const { +void CudaTensorConsoleDumper::Print(const char*, const MLFloat16*, int, int, int, int) const { } -void CudaTensorConsoleDumper::Print(const char*, const int64_t*, int, int, int) const { +void CudaTensorConsoleDumper::Print(const char*, const BFloat16*, int, int) const { } -void CudaTensorConsoleDumper::Print(const char*, const int32_t*, int, int, int) const { +void CudaTensorConsoleDumper::Print(const char*, const BFloat16*, int, int, int) const { } -void CudaTensorConsoleDumper::Print(const char*, const float*, int, int, int, int) const { +void CudaTensorConsoleDumper::Print(const char*, const BFloat16*, int, int, int, int) const { } -void CudaTensorConsoleDumper::Print(const char*, const MLFloat16*, int, int, int, int) const { +void CudaTensorConsoleDumper::Print(const char*, const half*, int, int) const { +} + +void CudaTensorConsoleDumper::Print(const char*, const half*, int, int, int) const { } void CudaTensorConsoleDumper::Print(const char*, const half*, int, int, int, int) const { diff --git a/onnxruntime/contrib_ops/cuda/transformers/dump_cuda_tensor.h b/onnxruntime/contrib_ops/cuda/transformers/dump_cuda_tensor.h index 264ecd7cfe2f..773401f79531 100644 --- a/onnxruntime/contrib_ops/cuda/transformers/dump_cuda_tensor.h +++ b/onnxruntime/contrib_ops/cuda/transformers/dump_cuda_tensor.h @@ -16,20 +16,31 @@ class CudaTensorConsoleDumper : public onnxruntime::contrib::transformers::ICons public: CudaTensorConsoleDumper() = default; virtual ~CudaTensorConsoleDumper() {} - void Print(const char* name, const float* tensor, int dim0, int dim1) const override; - void Print(const char* name, const MLFloat16* tensor, int dim0, int dim1) const override; + void Print(const char* name, const size_t* tensor, int dim0, int dim1) const override; - void Print(const char* name, const half* tensor, int dim0, int dim1) const; - void Print(const char* name, const int64_t* tensor, int dim0, int dim1) const override; + void Print(const char* name, const int32_t* tensor, int dim0, int dim1) const override; + void Print(const char* name, const int32_t* tensor, int dim0, int dim1, int dim2) const override; + + void Print(const char* name, const int64_t* tensor, int dim0, int dim1) const override; + void Print(const char* name, const int64_t* tensor, int dim0, int dim1, int dim2) const override; + + void Print(const char* name, const float* tensor, int dim0, int dim1) const override; void Print(const char* name, const float* tensor, int dim0, int dim1, int dim2) const override; void Print(const char* name, const float* tensor, int dim0, int dim1, int dim2, int dim3) const; - void Print(const char* name, const MLFloat16* tensor, int dim0, int dim1, int dim2) const override; - void Print(const char* name, const MLFloat16* tensor, int dim0, int dim1, int dim2, int dim3) const; + + void Print(const char* name, const 
half* tensor, int dim0, int dim1) const; void Print(const char* name, const half* tensor, int dim0, int dim1, int dim2) const; void Print(const char* name, const half* tensor, int dim0, int dim1, int dim2, int dim3) const; - void Print(const char* name, const int64_t* tensor, int dim0, int dim1, int dim2) const override; - void Print(const char* name, const int32_t* tensor, int dim0, int dim1, int dim2) const override; + + void Print(const char* name, const MLFloat16* tensor, int dim0, int dim1) const override; + void Print(const char* name, const MLFloat16* tensor, int dim0, int dim1, int dim2) const override; + void Print(const char* name, const MLFloat16* tensor, int dim0, int dim1, int dim2, int dim3) const; + + void Print(const char* name, const BFloat16* tensor, int dim0, int dim1) const; + void Print(const char* name, const BFloat16* tensor, int dim0, int dim1, int dim2) const; + void Print(const char* name, const BFloat16* tensor, int dim0, int dim1, int dim2, int dim3) const; + void Print(const char* name, const Tensor& value) const override; void Print(const char* name, const OrtValue& value) const override; void Print(const char* name, int index, bool end_line) const override; diff --git a/onnxruntime/contrib_ops/cuda/transformers/generation_cuda_impl.cu b/onnxruntime/contrib_ops/cuda/transformers/generation_cuda_impl.cu index dbd7fb010462..eb1943b59d97 100644 --- a/onnxruntime/contrib_ops/cuda/transformers/generation_cuda_impl.cu +++ b/onnxruntime/contrib_ops/cuda/transformers/generation_cuda_impl.cu @@ -1,11 +1,22 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. + +// cub.cuh includes device/dispatch_radix_sort.cuh which has assignment in conditional expressions +#if defined(_MSC_VER) +#pragma warning(push) +#pragma warning(disable : 4706) +#endif +#include +#if defined(_MSC_VER) +#pragma warning(pop) +#endif + +#include + #include "core/providers/cuda/cuda_common.h" #include "core/providers/cuda/cu_inc/common.cuh" -#include "cub/util_type.cuh" -#include -#include + #include "contrib_ops/cuda/bert/utils.cuh" #include "contrib_ops/cuda/transformers/generation_cuda_impl.h" @@ -307,12 +318,13 @@ __device__ bool BeamHypotheses::CanImprove(float best_sum_logprobs, int current_ return beams_[beams_count_ - 1].score < current_score; } +template __device__ void BeamHypotheses::Output( int top_k, int max_length, int pad_token_id, int32_t* sequences, // buffer of shape (num_return_sequences, max_length) - float* sequences_scores) // buffer of shape (num_return_sequences) or empty + T* sequences_scores) // buffer of shape (num_return_sequences) or empty { // Copy the top_k beams into the sequences for (int index = 0; index < top_k; index++) { @@ -327,7 +339,7 @@ __device__ void BeamHypotheses::Output( target[i] = pad_token_id; if (sequences_scores) - sequences_scores[index] = item.score; + sequences_scores[index] = (T)item.score; } } @@ -501,13 +513,14 @@ void LaunchBeamSearchScorer_AppendNextTokenToSequences(BeamScorerState& state_cp next_beam_tokens.data()); } +template __global__ void BeamSearchScorer_Finalize(BeamScorerState& state, const int32_t* sequences_buffer, int sequence_length, BeamHypotheses* beam_hyps_, const float* final_beam_scores, int32_t* output, - float* sequence_scores) { + T* sequence_scores) { int batch_index = blockIdx.x * blockDim.x + threadIdx.x; if (batch_index >= state.batch_size_) return; @@ -534,6 +547,7 @@ __global__ void BeamSearchScorer_Finalize(BeamScorerState& state, sequence_scores ? 
sequence_scores + batch_index * state.num_return_sequences_ : nullptr); } +template void LaunchBeamSearchScorer_Finalize(int batch_size, BeamScorerState& state, gsl::span sequences, @@ -541,7 +555,7 @@ void LaunchBeamSearchScorer_Finalize(int batch_size, gsl::span beam_hyps, gsl::span final_beam_scores, gsl::span output, - gsl::span sequence_scores, + gsl::span sequence_scores, cudaStream_t stream) { BeamSearchScorer_Finalize<<<1, batch_size, 0, stream>>>(state, sequences.data(), @@ -552,6 +566,58 @@ void LaunchBeamSearchScorer_Finalize(int batch_size, sequence_scores.data()); } +template void LaunchBeamSearchScorer_Finalize( + int batch_size, + BeamScorerState& state, + gsl::span sequences, + int sequence_length, + gsl::span beam_hyps, + gsl::span final_beam_scores, + gsl::span output, + gsl::span sequence_scores, + cudaStream_t stream); + +template void LaunchBeamSearchScorer_Finalize<__half>( + int batch_size, + BeamScorerState& state, + gsl::span sequences, + int sequence_length, + gsl::span beam_hyps, + gsl::span final_beam_scores, + gsl::span output, + gsl::span<__half> sequence_scores, + cudaStream_t stream); + +template +__global__ void FloatConvertAndCopyKernel(const float* src, T* dst, size_t total_elements) { + int64_t index = (int64_t)blockIdx.x * blockDim.x + threadIdx.x; + if (index < total_elements) { + dst[index] = (T)src[index]; + } +} + +template +void LaunchBeamSearchScoreCopy(gsl::span final_scores, + gsl::span output_scores, + cudaStream_t stream) { + ORT_ENFORCE(final_scores.size() == output_scores.size()); + constexpr unsigned ThreadPerBlock = 256; + unsigned num_blocks = (unsigned)((final_scores.size() + (ThreadPerBlock - 1))/ ThreadPerBlock); + + typedef typename ToCudaType::MappedType CudaT; + + FloatConvertAndCopyKernel<<>>( + final_scores.data(), (CudaT*)output_scores.data(), final_scores.size()); +} + +template void LaunchBeamSearchScoreCopy(gsl::span final_scores, + gsl::span output_scores, + cudaStream_t stream); + +template void LaunchBeamSearchScoreCopy(gsl::span final_scores, + gsl::span output_scores, + cudaStream_t stream); + __global__ void AddProbsKernel(float* log_probs, float* cum_log_probs, const int vocab_size, diff --git a/onnxruntime/contrib_ops/cuda/transformers/generation_cuda_impl.h b/onnxruntime/contrib_ops/cuda/transformers/generation_cuda_impl.h index 5ed5949196b2..281cb6c72597 100644 --- a/onnxruntime/contrib_ops/cuda/transformers/generation_cuda_impl.h +++ b/onnxruntime/contrib_ops/cuda/transformers/generation_cuda_impl.h @@ -65,11 +65,12 @@ struct BeamHypotheses { __device__ bool CanImprove(float best_sum_logprobs, int current_length) const; // Output results - __device__ void Output(int top_k, // number of sequences to return - int max_length, // max sequence length - int pad_token_id, // pad token - int32_t* sequences, // buffer with pad token, shape (num_return_sequences, max_length) - float* sequences_scores); // buffer for sequence scores, with shape (num_return_sequences) + template + __device__ void Output(int top_k, // number of sequences to return + int max_length, // max sequence length + int pad_token_id, // pad token + int32_t* sequences, // buffer with pad token, shape (num_return_sequences, max_length) + T* sequences_scores); // buffer for sequence scores, with shape (num_return_sequences) }; struct BeamScorerState { @@ -110,6 +111,7 @@ void LaunchBeamSearchScorer_AppendNextTokenToSequences(BeamScorerState& state_cp gsl::span next_beam_indices, cudaStream_t stream); +template void LaunchBeamSearchScorer_Finalize(int 
batch_size, BeamScorerState& state, gsl::span sequences, @@ -117,9 +119,14 @@ void LaunchBeamSearchScorer_Finalize(int batch_size, gsl::span beam_hyps_, gsl::span final_beam_scores, gsl::span output, - gsl::span sequence_scores, + gsl::span sequence_scores, cudaStream_t stream); +template +void LaunchBeamSearchScoreCopy(gsl::span final_scores, + gsl::span output_scores, + cudaStream_t stream); + void LaunchNextTokenKernel(const int64_t* next_token_indices, int32_t* next_indices, int32_t* next_tokens, diff --git a/onnxruntime/contrib_ops/cuda/transformers/generation_device_helper.cc b/onnxruntime/contrib_ops/cuda/transformers/generation_device_helper.cc index 380d561bbb23..7adc2fe0a67e 100644 --- a/onnxruntime/contrib_ops/cuda/transformers/generation_device_helper.cc +++ b/onnxruntime/contrib_ops/cuda/transformers/generation_device_helper.cc @@ -424,7 +424,7 @@ Status ProcessLogits(const OrtValue& logits, // const bool is_whisper_model = (parameters->model_type == onnxruntime::contrib::transformers::IGenerationParameters::kModelTypeWhisper); if (step == 1 && is_whisper_model && parameters->no_speech_probs) { cuda::LaunchSaveNoSpeechProbs( - (T*)parameters->no_speech_probs, Y_data, batch_size, num_beams, vocab_size, parameters->no_speech_token, cuda_stream); + (T*)parameters->no_speech_probs, Y_data, batch_size, num_beams, vocab_size, parameters->no_speech_token_id, cuda_stream); } // NOTE: currently we treat extra decoding ids are same @@ -469,7 +469,15 @@ Status ProcessLogits(const OrtValue& logits, // cudaMemcpyDeviceToHost, cuda_stream)); constexpr int max_initial_timestamp_index = 50; - onnxruntime::contrib::transformers::TimestampLogitsProcessor time_logit_processor(parameters->eos_token_id, max_initial_timestamp_index); + // Token ids are passed below in the order that they appear in the tokenizer + onnxruntime::contrib::transformers::TimestampLogitsProcessor time_logit_processor(parameters->eos_token_id, + parameters->decoder_start_token_id, + parameters->translate_token_id, + parameters->transcribe_token_id, + parameters->start_of_lm_token_id, + parameters->no_timestamps_token_id, + parameters->beginning_timestamp_token_id, + max_initial_timestamp_index); onnxruntime::contrib::transformers::NextTokenScores next_token_scores_timestamp({cpu_next_token_scores_span, batch_beam_size, vocab_size}); CUDA_RETURN_IF_ERROR(cudaStreamSynchronize(cuda_stream)); @@ -620,6 +628,8 @@ struct CudaBeamSearchScorer : transformers::IBeamScorer { Tensor* output_sequences, Tensor* output_sequence_scores) override; + void OutputScores(gsl::span& final_scores, Tensor* output_scores) override; + bool IsDone() const override { return false; } // For CUDA we speculatively run the next step while we wait for the GPU to report status. We use 'IsDoneLater()' for this bool IsDoneLater() const override; @@ -632,7 +642,6 @@ struct CudaBeamSearchScorer : transformers::IBeamScorer { } gsl::span GetNextIndicesGPU() override { return next_beam_indices_; } - private: mutable cuda::AutoDestoryCudaEvent event_process_complete_; IAllocatorUniquePtr state_cpu_; IAllocatorUniquePtr state_gpu_; @@ -743,22 +752,58 @@ bool CudaBeamSearchScorer::IsDoneLater() const { return state_cpu_->not_done_count_ == 0; } +template +void CudaOutputSequenceScores(CudaBeamSearchScorer* scorer, + transformers::ISequences& sequences, + gsl::span& final_beam_scores, + Tensor* output_sequences, + Tensor* output_sequence_scores) { + // Word IDs of each sequence, with shape (batch_size * num_return_sequences, max_sequence_length). 
+ gsl::span output{output_sequences->MutableData(), static_cast(output_sequences->Shape().Size())}; + + // Score of each sequence, with shape (batch_size * num_return_sequences). + using CudaT = typename ToCudaType::MappedType; + gsl::span sequence_scores; + if (output_sequence_scores) { + sequence_scores = gsl::span{(CudaT*)output_sequence_scores->MutableData(), static_cast(output_sequence_scores->Shape().Size())}; + } + + cuda::LaunchBeamSearchScorer_Finalize(scorer->state_cpu_->batch_size_, + *scorer->state_gpu_, + sequences.GetCurrentDeviceSequences(), + sequences.GetSequenceLength(), + scorer->beam_hyps_, + final_beam_scores, + output, + sequence_scores, + scorer->stream_); +} + void CudaBeamSearchScorer::Finalize(transformers::ISequences& sequences, gsl::span& final_beam_scores, Tensor* output_sequences, Tensor* output_sequence_scores) { ORT_ENFORCE(output_sequences != nullptr); - // Word IDs of each sequence, with shape (batch_size * num_return_sequences, max_sequence_length). - gsl::span output{output_sequences->MutableData(), static_cast(output_sequences->Shape().Size())}; - - // Score of each sequence, with shape (batch_size * num_return_sequences). - gsl::span sequence_scores; - if (output_sequence_scores) { - sequence_scores = gsl::span{output_sequence_scores->MutableData(), static_cast(output_sequence_scores->Shape().Size())}; + if (output_sequence_scores == nullptr || output_sequence_scores->IsDataType()) { + CudaOutputSequenceScores(this, sequences, final_beam_scores, output_sequences, output_sequence_scores); + } else { + ORT_ENFORCE(output_sequence_scores->IsDataType()); + CudaOutputSequenceScores(this, sequences, final_beam_scores, output_sequences, output_sequence_scores); } +} - cuda::LaunchBeamSearchScorer_Finalize(state_cpu_->batch_size_, *state_gpu_, sequences.GetCurrentDeviceSequences(), sequences.GetSequenceLength(), beam_hyps_, final_beam_scores, output, sequence_scores, stream_); +void CudaBeamSearchScorer::OutputScores(gsl::span& final_scores, Tensor* output_scores) { + if (output_scores) { + if (output_scores->IsDataType()) { + gsl::span target(output_scores->MutableData(), output_scores->Shape().Size()); + cuda::LaunchBeamSearchScoreCopy(final_scores, target, stream_); + } else { + ORT_ENFORCE(output_scores->IsDataType()); + gsl::span target(output_scores->MutableData(), output_scores->Shape().Size()); + cuda::LaunchBeamSearchScoreCopy(final_scores, target, stream_); + } + } } std::unique_ptr CreateBeamScorer(const transformers::IGenerationParameters& parameters, diff --git a/onnxruntime/contrib_ops/js/bert/rotary_embedding.cc b/onnxruntime/contrib_ops/js/bert/rotary_embedding.cc new file mode 100644 index 000000000000..7ee168e27f6f --- /dev/null +++ b/onnxruntime/contrib_ops/js/bert/rotary_embedding.cc @@ -0,0 +1,20 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
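The CudaBeamSearchScorer changes above repeat one pattern: inspect the output tensor's runtime element type, then instantiate a helper templated on float or MLFloat16 so the launch code stays type-generic. A condensed standalone sketch of that dispatch, with stand-in types rather than the ORT API:

#include <cstdint>
#include <stdexcept>

enum class ScoreType { kFloat, kFloat16 };  // stand-in for the Tensor dtype checks

template <typename T>
void FinalizeScores() {
  // templated work, e.g. launching the T-specialized finalize kernel
}

// Runtime dtype mapped to a compile-time instantiation, mirroring Finalize() above.
inline void DispatchFinalize(ScoreType type) {
  switch (type) {
    case ScoreType::kFloat:
      FinalizeScores<float>();
      break;
    case ScoreType::kFloat16:
      FinalizeScores<std::uint16_t>();  // 16-bit stand-in for the half type
      break;
    default:
      throw std::invalid_argument("unsupported sequence-score type");
  }
}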
+ +#include "rotary_embedding.h" + +namespace onnxruntime { +namespace contrib { +namespace js { + +using onnxruntime::js::JsepSupportedFloatTypes; + +ONNX_OPERATOR_KERNEL_EX(RotaryEmbedding, kMSDomain, 1, kJsExecutionProvider, + (*KernelDefBuilder::Create()) + .TypeConstraint("T", JsepSupportedFloatTypes()) + .TypeConstraint("M", DataTypeImpl::GetTensorType()), + RotaryEmbedding); + +} // namespace js +} // namespace contrib +} // namespace onnxruntime diff --git a/onnxruntime/contrib_ops/js/bert/rotary_embedding.h b/onnxruntime/contrib_ops/js/bert/rotary_embedding.h new file mode 100644 index 000000000000..376b4e7082fb --- /dev/null +++ b/onnxruntime/contrib_ops/js/bert/rotary_embedding.h @@ -0,0 +1,35 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once + +#include "core/providers/js/js_kernel.h" + +namespace onnxruntime { +namespace contrib { +namespace js { + +using onnxruntime::js::JsKernel; + +class RotaryEmbedding final : public JsKernel { + public: + explicit RotaryEmbedding(const OpKernelInfo& info) : JsKernel(info) { + int64_t interleaved = info.GetAttrOrDefault("interleaved", 0); + int64_t num_heads = info.GetAttrOrDefault("num_heads", 0); + int64_t rotary_embedding_dim = info.GetAttrOrDefault("rotary_embedding_dim", 0); + float scale = info.GetAttrOrDefault("scale", 1.0); + + JSEP_INIT_KERNEL_ATTRIBUTE(RotaryEmbedding, ({ + "interleaved" : !!$1, + "numHeads" : $2, + "rotaryEmbeddingDim" : $3, + "scale" : $4, + }), + static_cast(interleaved), static_cast(num_heads), + static_cast(rotary_embedding_dim), scale); + } +}; + +} // namespace js +} // namespace contrib +} // namespace onnxruntime diff --git a/onnxruntime/contrib_ops/js/fast_gelu.cc b/onnxruntime/contrib_ops/js/fast_gelu.cc new file mode 100644 index 000000000000..62c538318160 --- /dev/null +++ b/onnxruntime/contrib_ops/js/fast_gelu.cc @@ -0,0 +1,23 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "fast_gelu.h" + +namespace onnxruntime { +namespace contrib { +namespace js { + +using onnxruntime::js::JsepSupportedFloatTypes; + +ONNX_OPERATOR_KERNEL_EX( + FastGelu, + kMSDomain, + 1, + kJsExecutionProvider, + (*KernelDefBuilder::Create()) + .TypeConstraint("T", JsepSupportedFloatTypes()), + FastGelu); + +} // namespace js +} // namespace contrib +} // namespace onnxruntime diff --git a/onnxruntime/contrib_ops/js/fast_gelu.h b/onnxruntime/contrib_ops/js/fast_gelu.h new file mode 100644 index 000000000000..68c7892741c6 --- /dev/null +++ b/onnxruntime/contrib_ops/js/fast_gelu.h @@ -0,0 +1,17 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
+ +#pragma once + +#include "core/providers/js/js_kernel.h" + +namespace onnxruntime { +namespace contrib { +namespace js { + +using onnxruntime::js::JsKernel; +JSEP_KERNEL_IMPL(FastGelu, FastGelu); + +} // namespace js +} // namespace contrib +} // namespace onnxruntime diff --git a/onnxruntime/contrib_ops/js/js_contrib_kernels.cc b/onnxruntime/contrib_ops/js/js_contrib_kernels.cc index 498a9f5679eb..a6f8aebc2d1e 100644 --- a/onnxruntime/contrib_ops/js/js_contrib_kernels.cc +++ b/onnxruntime/contrib_ops/js/js_contrib_kernels.cc @@ -8,12 +8,17 @@ namespace contrib { namespace js { class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSDomain, 1, Attention); +class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSDomain, 1, BiasAdd); +class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSDomain, 1, BiasSplitGelu); +class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSDomain, 1, FastGelu); +class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSDomain, 1, FusedConv); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSDomain, 1, Gelu); +class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSDomain, 1, MatMulNBits); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSDomain, 1, MultiHeadAttention); -class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSDomain, 1, BiasSplitGelu); -class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSDomain, 1, BiasAdd); +class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSDomain, 1, RotaryEmbedding); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSDomain, 1, SkipLayerNormalization); -class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSDomain, 1, FusedConv); +class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, SimplifiedLayerNormalization); +class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSDomain, 1, SkipSimplifiedLayerNormalization); template <> KernelCreateInfo BuildKernelCreateInfo() { @@ -24,13 +29,20 @@ KernelCreateInfo BuildKernelCreateInfo() { Status RegisterJsContribKernels(KernelRegistry& kernel_registry) { static const BuildKernelCreateInfoFn function_table[] = { BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, BuildKernelCreateInfo, - BuildKernelCreateInfo}; + BuildKernelCreateInfo, + BuildKernelCreateInfo}; for (auto& function_table_entry : function_table) { KernelCreateInfo info = function_table_entry(); diff --git a/onnxruntime/contrib_ops/js/layer_norm.cc b/onnxruntime/contrib_ops/js/layer_norm.cc new file mode 100644 index 000000000000..814543a9905e --- /dev/null +++ b/onnxruntime/contrib_ops/js/layer_norm.cc @@ -0,0 +1,23 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
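The next hunks register SimplifiedLayerNormalization and fold SkipSimplifiedLayerNormalization into the existing SkipLayerNorm kernel via a "simplified" flag. "Simplified" layer normalization is the RMSNorm variant: it skips the mean subtraction (and bias) of full LayerNorm and only rescales by the root mean square. A scalar reference of both variants, as a sketch of the math only:

#include <cmath>
#include <cstddef>
#include <vector>

// One row of LayerNorm (simplified == false) vs RMSNorm (simplified == true).
std::vector<float> NormalizeRow(const std::vector<float>& x,
                                const std::vector<float>& gamma,
                                float epsilon, bool simplified) {
  const std::size_t h = x.size();
  float mean = 0.0f;
  if (!simplified) {  // the simplified form keeps the mean at zero
    for (float v : x) mean += v;
    mean /= static_cast<float>(h);
  }
  float var = 0.0f;  // with mean == 0 this is mean(x^2), i.e. RMS^2
  for (float v : x) var += (v - mean) * (v - mean);
  var /= static_cast<float>(h);
  std::vector<float> y(h);
  for (std::size_t i = 0; i < h; ++i) {
    y[i] = (x[i] - mean) / std::sqrt(var + epsilon) * gamma[i];
  }
  return y;
}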
+ +#include "core/providers/js/js_data_types.h" +#include "core/providers/js/operators/layer_norm.h" + +namespace onnxruntime { +namespace contrib { +namespace js { + +ONNX_OPERATOR_KERNEL_EX( + SimplifiedLayerNormalization, + kOnnxDomain, + 1, + kJsExecutionProvider, + (*KernelDefBuilder::Create()) + .TypeConstraint("T", onnxruntime::js::JsepSupportedFloatTypes()) + .TypeConstraint("U", onnxruntime::js::JsepSupportedFloatTypes()), + onnxruntime::js::LayerNorm); + +} // namespace js +} // namespace contrib +} // namespace onnxruntime diff --git a/onnxruntime/contrib_ops/js/quantization/matmul_nbits.cc b/onnxruntime/contrib_ops/js/quantization/matmul_nbits.cc new file mode 100644 index 000000000000..888db0fd161f --- /dev/null +++ b/onnxruntime/contrib_ops/js/quantization/matmul_nbits.cc @@ -0,0 +1,25 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "contrib_ops/js/quantization/matmul_nbits.h" +#include "core/providers/js/js_data_types.h" + +namespace onnxruntime { +namespace contrib { +namespace js { + +using onnxruntime::js::JsepSupportedFloatTypes; + +ONNX_OPERATOR_KERNEL_EX( + MatMulNBits, + kMSDomain, + 1, + kJsExecutionProvider, + (*KernelDefBuilder::Create()) + .TypeConstraint("T1", JsepSupportedFloatTypes()) + .TypeConstraint("T2", DataTypeImpl::GetTensorType()), + MatMulNBits); + +} // namespace js +} // namespace contrib +} // namespace onnxruntime diff --git a/onnxruntime/contrib_ops/js/quantization/matmul_nbits.h b/onnxruntime/contrib_ops/js/quantization/matmul_nbits.h new file mode 100644 index 000000000000..cca2c4757765 --- /dev/null +++ b/onnxruntime/contrib_ops/js/quantization/matmul_nbits.h @@ -0,0 +1,48 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
+ +#include "core/providers/js/js_kernel.h" + +namespace onnxruntime { +namespace contrib { +namespace js { + +using onnxruntime::js::JsKernel; + +class MatMulNBits final : public JsKernel { + public: + MatMulNBits(const OpKernelInfo& info) : JsKernel(info), + K_{narrow(info.GetAttr("K"))}, + N_{narrow(info.GetAttr("N"))}, + accuracy_level_{info.GetAttrOrDefault("accuracy_level", 0)}, + nbits_{narrow(info.GetAttr("bits"))}, + block_size_{narrow(info.GetAttr("block_size"))} { + ORT_ENFORCE(nbits_ == 4, + "Only 4b quantization is supported for MatMulNBits op, additional bits support is planned."); + ORT_ENFORCE(block_size_ >= 16 && !(block_size_ & (block_size_ - 1)), + "Block size must be a power of 2 and greater than or equal to 16."); + JSEP_INIT_KERNEL_ATTRIBUTE(MatMulNBits, ({ + "k" : $1, + "n" : $2, + "accuracyLevel" : $3, + "bits" : $4, + "blockSize" : $5 + }), + static_cast(K_), + static_cast(N_), + static_cast(accuracy_level_), + static_cast(nbits_), + static_cast(block_size_)); + } + + private: + const size_t K_; + const size_t N_; + const int64_t accuracy_level_; + const size_t nbits_; + const size_t block_size_; +}; + +} // namespace js +} // namespace contrib +} // namespace onnxruntime diff --git a/onnxruntime/contrib_ops/js/skip_layer_norm.cc b/onnxruntime/contrib_ops/js/skip_layer_norm.cc index f949326e1dc9..dc2c4ab75f2f 100644 --- a/onnxruntime/contrib_ops/js/skip_layer_norm.cc +++ b/onnxruntime/contrib_ops/js/skip_layer_norm.cc @@ -14,10 +14,16 @@ ONNX_OPERATOR_KERNEL_EX( kMSDomain, 1, kJsExecutionProvider, - (*KernelDefBuilder::Create()) - .TypeConstraint("T", JsepSupportedFloatTypes()) - .TypeConstraint("U", JsepSupportedFloatTypes()), - SkipLayerNorm); + (*KernelDefBuilder::Create()).TypeConstraint("T", JsepSupportedFloatTypes()), + SkipLayerNorm); + +ONNX_OPERATOR_KERNEL_EX( + SkipSimplifiedLayerNormalization, + kMSDomain, + 1, + kJsExecutionProvider, + (*KernelDefBuilder::Create()).TypeConstraint("T", JsepSupportedFloatTypes()), + SkipLayerNorm); } // namespace js } // namespace contrib diff --git a/onnxruntime/contrib_ops/js/skip_layer_norm.h b/onnxruntime/contrib_ops/js/skip_layer_norm.h index c3011e96ae29..ead5146aa96d 100644 --- a/onnxruntime/contrib_ops/js/skip_layer_norm.h +++ b/onnxruntime/contrib_ops/js/skip_layer_norm.h @@ -11,19 +11,20 @@ namespace js { using onnxruntime::js::JsKernel; +template class SkipLayerNorm final : public JsKernel { public: SkipLayerNorm(const OpKernelInfo& op_kernel_info) : JsKernel(op_kernel_info) { - ORT_ENFORCE(op_kernel_info.GetAttr("epsilon", &epsilon_).IsOK()); - ORT_ENFORCE(epsilon_ >= 0); + float epsilon; + ORT_ENFORCE(op_kernel_info.GetAttr("epsilon", &epsilon).IsOK()); + ORT_ENFORCE(epsilon >= 0); JSEP_INIT_KERNEL_ATTRIBUTE(SkipLayerNormalization, ({ - "epsilon" : $1 + "epsilon" : $1, + "simplified" : !!$2 }), - epsilon_); + epsilon, + static_cast(simplified)); } - - private: - float epsilon_; }; } // namespace js diff --git a/onnxruntime/contrib_ops/rocm/bert/batched_gemm_softmax_gemm_permute_ck_impl/impl.cuh b/onnxruntime/contrib_ops/rocm/bert/batched_gemm_softmax_gemm_permute_ck_impl/impl.cuh index 0599318a4022..be8508670e4b 100644 --- a/onnxruntime/contrib_ops/rocm/bert/batched_gemm_softmax_gemm_permute_ck_impl/impl.cuh +++ b/onnxruntime/contrib_ops/rocm/bert/batched_gemm_softmax_gemm_permute_ck_impl/impl.cuh @@ -31,7 +31,7 @@ using MaskingSpecialization = ck::tensor_operation::device::MaskingSpecializatio using PassThrough = ck::tensor_operation::element_wise::PassThrough; -using 
ck::tensor_operation::device::DeviceBatchedGemmSoftmaxGemmPermute; // the interface +using ck::tensor_operation::device::DeviceBatchedGemmSoftmaxGemmPermute; // the interface using ck::tensor_operation::device::DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle; // the implementation static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; @@ -141,6 +141,35 @@ std::vector, F32, PreSoftmaxAttentionScoreOp, MaskingSpecialization::MaskDisabled>(); +template <> +std::vector, ck::Tuple<>, + PassThrough, PassThrough, PreSoftmaxAttentionScoreOp, PassThrough, PassThrough, + MaskingSpecialization::MaskOutUpperTriangle>>> +GetDeviceBatchedGemmSoftmaxGemmPermuteInstances< + F16, ck::Tuple<>, F32, PreSoftmaxAttentionScoreOp, MaskingSpecialization::MaskOutUpperTriangle>(); + +// fp16, biased, non-masked +template <> +std::vector, ck::Tuple<>, + PassThrough, PassThrough, PreSoftmaxAttentionScoreOp, PassThrough, PassThrough, + MaskingSpecialization::MaskOutUpperTriangle>>> +GetDeviceBatchedGemmSoftmaxGemmPermuteInstances< + F16, ck::Tuple, F32, PreSoftmaxAttentionScoreOp, MaskingSpecialization::MaskOutUpperTriangle>(); + +// fp16, biased, fp16 masked, basically, two bias +template <> +std::vector, ck::Tuple<>, + PassThrough, PassThrough, PreSoftmaxAttentionScoreOp, PassThrough, PassThrough, + MaskingSpecialization::MaskOutUpperTriangle>>> +GetDeviceBatchedGemmSoftmaxGemmPermuteInstances< + F16, ck::Tuple, F32, PreSoftmaxAttentionScoreOp, MaskingSpecialization::MaskOutUpperTriangle>(); + } // namespace internal } // namespace rocm } // namespace contrib diff --git a/onnxruntime/contrib_ops/rocm/bert/batched_gemm_softmax_gemm_permute_ck_impl/impl_fp16.cu b/onnxruntime/contrib_ops/rocm/bert/batched_gemm_softmax_gemm_permute_ck_impl/impl_fp16.cu index 181e47f012c9..2e32a6594d16 100644 --- a/onnxruntime/contrib_ops/rocm/bert/batched_gemm_softmax_gemm_permute_ck_impl/impl_fp16.cu +++ b/onnxruntime/contrib_ops/rocm/bert/batched_gemm_softmax_gemm_permute_ck_impl/impl_fp16.cu @@ -32,6 +32,27 @@ GetDeviceBatchedGemmSoftmaxGemmPermuteInstances< return instances; } +using NonBiasedNonmaskedCausal = DeviceBatchedGemmSoftmaxGemmPermute< + 2, 1, 1, 1, 1, + F16, F16, F16, F16, ck::Tuple<>, ck::Tuple<>, + PassThrough, PassThrough, PreSoftmaxAttentionScoreOp, PassThrough, PassThrough, + MaskingSpecialization::MaskOutUpperTriangle>; + +template <> +std::vector> +GetDeviceBatchedGemmSoftmaxGemmPermuteInstances< + F16, ck::Tuple<>, F32, PreSoftmaxAttentionScoreOp, MaskingSpecialization::MaskOutUpperTriangle>() { + std::vector> instances; + ck::tensor_operation::device::instance::add_device_operation_instances( + instances, + device_batched_gemm_softmax_gemm_permute_instances< + 2, 1, 1, 1, 1, + F16, ck::Tuple<>, F32, PreSoftmaxAttentionScoreOp, + MaskingSpecialization::MaskOutUpperTriangle>{}); + + return instances; +} + } // namespace internal } // namespace rocm } // namespace contrib diff --git a/onnxruntime/contrib_ops/rocm/bert/batched_gemm_softmax_gemm_permute_ck_impl/impl_fp16_biased.cu b/onnxruntime/contrib_ops/rocm/bert/batched_gemm_softmax_gemm_permute_ck_impl/impl_fp16_biased.cu index 1577bdf397fa..91da8d9e1f9a 100644 --- a/onnxruntime/contrib_ops/rocm/bert/batched_gemm_softmax_gemm_permute_ck_impl/impl_fp16_biased.cu +++ b/onnxruntime/contrib_ops/rocm/bert/batched_gemm_softmax_gemm_permute_ck_impl/impl_fp16_biased.cu @@ -32,6 +32,27 @@ GetDeviceBatchedGemmSoftmaxGemmPermuteInstances< return instances; } +using BiasedNonmaskedCausal = DeviceBatchedGemmSoftmaxGemmPermute< + 2, 1, 
1, 1, 1, + F16, F16, F16, F16, ck::Tuple, ck::Tuple<>, + PassThrough, PassThrough, PreSoftmaxAttentionScoreOp, PassThrough, PassThrough, + MaskingSpecialization::MaskOutUpperTriangle>; + +template <> +std::vector> +GetDeviceBatchedGemmSoftmaxGemmPermuteInstances< + F16, ck::Tuple, F32, PreSoftmaxAttentionScoreOp, MaskingSpecialization::MaskOutUpperTriangle>() { + std::vector> instances; + ck::tensor_operation::device::instance::add_device_operation_instances( + instances, + device_batched_gemm_softmax_gemm_permute_instances< + 2, 1, 1, 1, 1, + F16, ck::Tuple, F32, PreSoftmaxAttentionScoreOp, + MaskingSpecialization::MaskOutUpperTriangle>{}); + + return instances; +} + } // namespace internal } // namespace rocm } // namespace contrib diff --git a/onnxruntime/contrib_ops/rocm/bert/batched_gemm_softmax_gemm_permute_ck_impl/impl_fp16_biased_biased.cu b/onnxruntime/contrib_ops/rocm/bert/batched_gemm_softmax_gemm_permute_ck_impl/impl_fp16_biased_biased.cu index 14de59234356..b08123be1897 100644 --- a/onnxruntime/contrib_ops/rocm/bert/batched_gemm_softmax_gemm_permute_ck_impl/impl_fp16_biased_biased.cu +++ b/onnxruntime/contrib_ops/rocm/bert/batched_gemm_softmax_gemm_permute_ck_impl/impl_fp16_biased_biased.cu @@ -32,6 +32,27 @@ GetDeviceBatchedGemmSoftmaxGemmPermuteInstances< return instances; } +using BiasedNonmaskedCausal = DeviceBatchedGemmSoftmaxGemmPermute< + 2, 1, 1, 1, 1, + F16, F16, F16, F16, ck::Tuple, ck::Tuple<>, + PassThrough, PassThrough, PreSoftmaxAttentionScoreOp, PassThrough, PassThrough, + MaskingSpecialization::MaskOutUpperTriangle>; + +template <> +std::vector> +GetDeviceBatchedGemmSoftmaxGemmPermuteInstances< + F16, ck::Tuple, F32, PreSoftmaxAttentionScoreOp, MaskingSpecialization::MaskOutUpperTriangle>() { + std::vector> instances; + ck::tensor_operation::device::instance::add_device_operation_instances( + instances, + device_batched_gemm_softmax_gemm_permute_instances< + 2, 1, 1, 1, 1, + F16, ck::Tuple, F32, PreSoftmaxAttentionScoreOp, + MaskingSpecialization::MaskOutUpperTriangle>{}); + + return instances; +} + } // namespace internal } // namespace rocm } // namespace contrib diff --git a/onnxruntime/contrib_ops/rocm/bert/batched_gemm_softmax_gemm_permute_pipelines.cuh b/onnxruntime/contrib_ops/rocm/bert/batched_gemm_softmax_gemm_permute_pipelines.cuh index 78983ac95e67..54dda4bfa6d2 100644 --- a/onnxruntime/contrib_ops/rocm/bert/batched_gemm_softmax_gemm_permute_pipelines.cuh +++ b/onnxruntime/contrib_ops/rocm/bert/batched_gemm_softmax_gemm_permute_pipelines.cuh @@ -732,122 +732,154 @@ class GemmSoftmaxGemmPermuteTunableOp : public tunable::TunableOp -auto GetCKGemmSoftmaxGemmPermuteTypeStringAndOps() { +template +auto GetArgAndRunInvoker(const U& impl, const V& invoker, const GemmSoftmaxGemmPermuteParams* params) { constexpr const int kNumBiasBuffer = static_cast(USE_BIAS) + static_cast(USE_MASK); using Nop = ck::tensor_operation::element_wise::PassThrough; using Acc0ElementOp = internal::PreSoftmaxAttentionScoreOp; + TUNABLE_OP_RETURN_UNSUPPORTED_ARGUMENT_IF( + !GemmSoftmaxGemmPermuteTunableOp::IsSupportedMode(params->attention), + "attention mode is not supported, got ", params->attention->mode); + if constexpr (USE_BIAS) { + TUNABLE_OP_RETURN_UNSUPPORTED_ARGUMENT_IF( + params->bias_buffer == nullptr, "biased version only support input with bias"); + } else { + TUNABLE_OP_RETURN_UNSUPPORTED_ARGUMENT_IF( + params->bias_buffer != nullptr, "non-biased version only support input without bias"); + } + if constexpr (USE_MASK) { + TUNABLE_OP_RETURN_UNSUPPORTED_ARGUMENT_IF( + 
!GemmSoftmaxGemmPermuteTunableOp::IsSupportedMaskType(params->attention), + "mask type is not supported, got ", params->attention->mask_type); + TUNABLE_OP_RETURN_UNSUPPORTED_ARGUMENT_IF( + params->mask_index_buffer == nullptr, "masked version only support input with mask"); + } else { + TUNABLE_OP_RETURN_UNSUPPORTED_ARGUMENT_IF( + params->mask_index_buffer != nullptr, "non-masked version only support input without mask"); + } + + auto attn = params->attention; + const int& G0 = attn->batch_size; + const int& G1 = attn->num_heads; + const int& M = attn->sequence_length; + const int& N = attn->total_sequence_length; + const int& K = attn->head_size; + const int& O = attn->v_head_size; + { + auto [m, n, k, o, batch] = params->GetGemmsMNKOBatch(); + ORT_ENFORCE(M == m && N == n && K == k && O == o && G0 * G1 == batch, "semantic mismatch"); + } + + auto [qs, ks, vs] = GetQkvStrides(attn); + std::vector q_buffer_lengths = {G0, G1, M, K}; + std::vector q_buffer_strides = qs.template ForBNSHCoord>(); + std::vector k_buffer_lengths = {G0, G1, N, K}; + std::vector k_buffer_strides = ks.template ForBNSHCoord>(); + std::vector v_buffer_lengths = {G0, G1, O, N}; + std::vector v_buffer_strides = vs.template ForBNHSCoord>(); + std::vector out_buffer_lengths = {G0, G1, M, O}; + std::vector out_buffer_strides = {M * G1 * O, O, G1 * O, 1}; // permute 0213 + + std::array bias_buffers{}; + std::array, kNumBiasBuffer> bias_lengths{}; + std::array, kNumBiasBuffer> bias_strides{}; + if constexpr (USE_BIAS) { + bias_buffers[0] = const_cast(params->bias_buffer); + bias_lengths[0] = {G0, G1, M, N}; // BN(G0*G1), S(M), T(N) + bias_strides[0] = {G1 * M * N, M * N, N, 1}; + } + if constexpr (USE_MASK) { + bias_buffers[kNumBiasBuffer - 1] = params->workspace_buffer; + bias_lengths[kNumBiasBuffer - 1] = {G0, G1, M, N}; // BN(G0*G1), S(M), T(N) + if (params->mask_index_dims.size() == 2) { // [B,T] + bias_strides[kNumBiasBuffer - 1] = {N, 0, 0, 1}; + } else if (params->mask_index_dims.size() == 3) { // [B,S,T] + bias_strides[kNumBiasBuffer - 1] = {M * N, 0, N, 1}; + } else if (params->mask_index_dims.size() == 4) { // [B,1,max_seq_len,max_seq_len] -->convert--> [B,S,T] + bias_strides[kNumBiasBuffer - 1] = {M * N, 0, N, 1}; + } else { + ORT_ENFORCE(false, "Unreachable"); + } + } + + auto arg = impl->MakeArgumentPointer( + params->q_buffer, params->k_buffer, params->v_buffer, params->out_buffer, + bias_buffers, // Gemm1 bias, as attention mask + {}, // Gemm2 bias + q_buffer_lengths, q_buffer_strides, + k_buffer_lengths, k_buffer_strides, + v_buffer_lengths, v_buffer_strides, + out_buffer_lengths, out_buffer_strides, + bias_lengths, bias_strides, + {}, + {}, + Nop{}, + Nop{}, + Acc0ElementOp{params->scale}, + Nop{}, + Nop{}); + + TUNABLE_OP_RETURN_UNSUPPORTED_ARGUMENT_IF(!impl->IsSupportedArgument(arg.get()), + impl->GetTypeString(), " does not support the params"); + + if constexpr (USE_MASK) { + ORT_RETURN_IF_ERROR(GemmSoftmaxGemmPermuteTunableOp::LaunchConvertToFilledMaskValue(params)); + } + + invoker->Run(arg.get(), StreamConfig{params->StreamHandle()}); + return Status::OK(); +} + +template +auto GetCKGemmSoftmaxGemmPermuteTypeStringAndOps() { using CKDataType = typename CKDataTypeAdaptor::type; using D0DataType = typename ck::detail::tuple_concat< std::conditional_t, ck::Tuple<>>, std::conditional_t, ck::Tuple<>>>::type; - constexpr static auto MaskingSpec = + constexpr static auto MaskingSpecMaskDisabled = ck::tensor_operation::device::MaskingSpecialization::MaskDisabled; + constexpr static auto 
MaskingSpecMaskOutUpperTriangle = + ck::tensor_operation::device::MaskingSpecialization::MaskOutUpperTriangle; + + std::vector>>> + ret; - std::vector>>> ret; for (auto&& impl : internal::GetDeviceBatchedGemmSoftmaxGemmPermuteInstances< - CKDataType, D0DataType, internal::F32, internal::PreSoftmaxAttentionScoreOp, MaskingSpec>()) { + CKDataType, D0DataType, internal::F32, internal::PreSoftmaxAttentionScoreOp, MaskingSpecMaskDisabled>()) { auto type_string = impl->GetTypeString(); auto invoker = impl->MakeInvokerPointer(); auto op = [impl = std::move(impl), invoker = std::move(invoker)]( const GemmSoftmaxGemmPermuteParams* params) -> Status { TUNABLE_OP_RETURN_UNSUPPORTED_ARGUMENT_IF( - !GemmSoftmaxGemmPermuteTunableOp::IsSupportedMode(params->attention), - "attention mode is not supported, got ", params->attention->mode); - if constexpr (USE_BIAS) { - TUNABLE_OP_RETURN_UNSUPPORTED_ARGUMENT_IF( - params->bias_buffer == nullptr, "biased version only support input with bias"); - } else { - TUNABLE_OP_RETURN_UNSUPPORTED_ARGUMENT_IF( - params->bias_buffer != nullptr, "non-biased version only support input without bias"); - } - if constexpr (USE_MASK) { - TUNABLE_OP_RETURN_UNSUPPORTED_ARGUMENT_IF( - !GemmSoftmaxGemmPermuteTunableOp::IsSupportedMaskType(params->attention), - "mask type is not supported, got ", params->attention->mask_type); - TUNABLE_OP_RETURN_UNSUPPORTED_ARGUMENT_IF( - params->mask_index_buffer == nullptr, "masked version only support input with mask"); - } else { - TUNABLE_OP_RETURN_UNSUPPORTED_ARGUMENT_IF( - params->mask_index_buffer != nullptr, "non-masked version only support input without mask"); - } + params->attention->is_unidirectional, "unidirectional attention is not supported with MaskingSpecMaskDisabled"); - auto attn = params->attention; - const int& G0 = attn->batch_size; - const int& G1 = attn->num_heads; - const int& M = attn->sequence_length; - const int& N = attn->total_sequence_length; - const int& K = attn->head_size; - const int& O = attn->v_head_size; - { - auto [m, n, k, o, batch] = params->GetGemmsMNKOBatch(); - ORT_ENFORCE(M == m && N == n && K == k && O == o && G0 * G1 == batch, "semantic mismatch"); - } + return GetArgAndRunInvoker(impl, invoker, params); + }; + ret.emplace_back(std::make_pair(std::move(type_string), std::move(op))); + } - auto [qs, ks, vs] = GetQkvStrides(attn); - std::vector q_buffer_lengths = {G0, G1, M, K}; - std::vector q_buffer_strides = qs.template ForBNSHCoord>(); - std::vector k_buffer_lengths = {G0, G1, N, K}; - std::vector k_buffer_strides = ks.template ForBNSHCoord>(); - std::vector v_buffer_lengths = {G0, G1, O, N}; - std::vector v_buffer_strides = vs.template ForBNHSCoord>(); - std::vector out_buffer_lengths = {G0, G1, M, O}; - std::vector out_buffer_strides = {M * G1 * O, O, G1 * O, 1}; // permute 0213 - - std::array bias_buffers{}; - std::array, kNumBiasBuffer> bias_lengths{}; - std::array, kNumBiasBuffer> bias_strides{}; - if constexpr (USE_BIAS) { - bias_buffers[0] = const_cast(params->bias_buffer); - bias_lengths[0] = {G0, G1, M, N}; // BN(G0*G1), S(M), T(N) - bias_strides[0] = {G1 * M * N, M * N, N, 1}; - } - if constexpr (USE_MASK) { - bias_buffers[kNumBiasBuffer - 1] = params->workspace_buffer; - bias_lengths[kNumBiasBuffer - 1] = {G0, G1, M, N}; // BN(G0*G1), S(M), T(N) - if (params->mask_index_dims.size() == 2) { // [B,T] - bias_strides[kNumBiasBuffer - 1] = {N, 0, 0, 1}; - } else if (params->mask_index_dims.size() == 3) { // [B,S,T] - bias_strides[kNumBiasBuffer - 1] = {M * N, 0, N, 1}; - } else if 
(params->mask_index_dims.size() == 4) { // [B,1,max_seq_len,max_seq_len] -->convert--> [B,S,T] - bias_strides[kNumBiasBuffer - 1] = {M * N, 0, N, 1}; - } else { - ORT_ENFORCE(false, "Unreachable"); - } - } + for (auto&& impl : internal::GetDeviceBatchedGemmSoftmaxGemmPermuteInstances< + CKDataType, D0DataType, internal::F32, internal::PreSoftmaxAttentionScoreOp, MaskingSpecMaskOutUpperTriangle>()) { + auto type_string = impl->GetTypeString(); - auto arg = impl->MakeArgumentPointer( - params->q_buffer, params->k_buffer, params->v_buffer, params->out_buffer, - bias_buffers, // Gemm1 bias, as attention mask - {}, // Gemm2 bias - q_buffer_lengths, q_buffer_strides, - k_buffer_lengths, k_buffer_strides, - v_buffer_lengths, v_buffer_strides, - out_buffer_lengths, out_buffer_strides, - bias_lengths, bias_strides, - {}, - {}, - Nop{}, - Nop{}, - Acc0ElementOp{params->scale}, - Nop{}, - Nop{}); - - TUNABLE_OP_RETURN_UNSUPPORTED_ARGUMENT_IF(!impl->IsSupportedArgument(arg.get()), - impl->GetTypeString(), " does not support the params"); - - if constexpr (USE_MASK) { - ORT_RETURN_IF_ERROR(GemmSoftmaxGemmPermuteTunableOp::LaunchConvertToFilledMaskValue(params)); - } - invoker->Run(arg.get(), StreamConfig{params->StreamHandle()}); - return Status::OK(); + auto invoker = impl->MakeInvokerPointer(); + auto op = [impl = std::move(impl), invoker = std::move(invoker)]( + const GemmSoftmaxGemmPermuteParams* params) -> Status { + TUNABLE_OP_RETURN_UNSUPPORTED_ARGUMENT_IF( + !params->attention->is_unidirectional, "bidirectional attention is not supported with MaskingSpecMaskOutUpperTriangle"); + TUNABLE_OP_RETURN_UNSUPPORTED_ARGUMENT_IF( + params->attention->sequence_length != params->attention->total_sequence_length, + "sequence_length != total_sequence_length is not supported with MaskingSpecMaskOutUpperTriangle"); + + return GetArgAndRunInvoker(impl, invoker, params); }; ret.emplace_back(std::make_pair(std::move(type_string), std::move(op))); } + return ret; } #endif // USE_COMPOSABLE_KERNEL diff --git a/onnxruntime/contrib_ops/rocm/bert/fast_gelu.cc b/onnxruntime/contrib_ops/rocm/bert/fast_gelu.cc deleted file mode 100644 index 9cb414e4e898..000000000000 --- a/onnxruntime/contrib_ops/rocm/bert/fast_gelu.cc +++ /dev/null @@ -1,59 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#include "contrib_ops/rocm/bert/fast_gelu.h" - -#include "core/providers/rocm/rocm_common.h" -#include "core/providers/rocm/miopen_common.h" -#include "contrib_ops/cpu/bert/bias_gelu_helper.h" -#include "contrib_ops/rocm/bert/elementwise.h" -#include "contrib_ops/rocm/bert/transformer_common.h" - -namespace onnxruntime { -namespace contrib { -namespace rocm { - -#define REGISTER_KERNEL_TYPED(T) \ - ONNX_OPERATOR_TYPED_KERNEL_EX( \ - FastGelu, \ - kMSDomain, \ - 1, \ - T, \ - kRocmExecutionProvider, \ - (*KernelDefBuilder::Create()) \ - .TypeConstraint("T", DataTypeImpl::GetTensorType()), \ - FastGelu); - -REGISTER_KERNEL_TYPED(float) -REGISTER_KERNEL_TYPED(MLFloat16) -REGISTER_KERNEL_TYPED(BFloat16) - -using namespace ONNX_NAMESPACE; - -template -Status FastGelu::ComputeInternal(OpKernelContext* context) const { - ORT_RETURN_IF_ERROR(bias_gelu_helper::CheckInputs(context)); - - const Tensor* input = context->Input(0); - const Tensor* bias = context->Input(1); - Tensor* output = context->Output(0, input->Shape()); - - int64_t input_length = input->Shape().Size(); - if (input_length == 0) { - return Status::OK(); - } - int64_t bias_length = (nullptr == bias) ?
0 : bias->Shape().Size(); - typedef typename ToHipType::MappedType HipT; - - const HipT* input_buffer = reinterpret_cast(input->Data()); - const HipT* bias_buffer = (nullptr != bias) ? reinterpret_cast(bias->Data()) : nullptr; - return LaunchElementwiseKernel( - GetTuningContext(), context->GetComputeStream(), - input_buffer, static_cast(input_length), - bias_buffer, static_cast(bias_length), - reinterpret_cast(output->MutableData())); -} - -} // namespace rocm -} // namespace contrib -} // namespace onnxruntime diff --git a/onnxruntime/contrib_ops/rocm/bert/fast_gelu.h b/onnxruntime/contrib_ops/rocm/bert/fast_gelu.h deleted file mode 100644 index 42bfe5a0b024..000000000000 --- a/onnxruntime/contrib_ops/rocm/bert/fast_gelu.h +++ /dev/null @@ -1,24 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#pragma once - -#include "core/common/common.h" -#include "core/providers/rocm/rocm_kernel.h" - -namespace onnxruntime { -namespace contrib { -namespace rocm { - -using namespace onnxruntime::rocm; - -template -class FastGelu final : public RocmKernel { - public: - FastGelu(const OpKernelInfo& op_kernel_info) : RocmKernel(op_kernel_info) {} - Status ComputeInternal(OpKernelContext* ctx) const override; -}; - -} // namespace rocm -} // namespace contrib -} // namespace onnxruntime diff --git a/onnxruntime/contrib_ops/rocm/bert/multihead_attention.cu b/onnxruntime/contrib_ops/rocm/bert/multihead_attention.cu index 6f98312e4067..09e7d61b71db 100644 --- a/onnxruntime/contrib_ops/rocm/bert/multihead_attention.cu +++ b/onnxruntime/contrib_ops/rocm/bert/multihead_attention.cu @@ -68,6 +68,7 @@ MultiHeadAttention::MultiHeadAttention(const OpKernelInfo& info) scale_ = info.GetAttrOrDefault("scale", 0.0f); past_present_share_buffer_ = info.GetAttrOrDefault("past_present_share_buffer", 0LL) != 0LL; + is_unidirectional_ = info.GetAttrOrDefault("unidirectional", 0) == 1; using HipT = typename ToHipType::MappedType; using AttentionTunableOp = GemmSoftmaxGemmPermuteTunableOp; @@ -121,8 +122,8 @@ Status MultiHeadAttention::ComputeInternal(OpKernelContext* context) const { query, key, value, bias, key_padding_mask, relative_position_bias, past_key, past_value, past_seq_len, - &attn, - num_heads_, mask_filter_value_, scale_, + &attn, num_heads_, + mask_filter_value_, scale_, false, /*is_unidirectional_*/ past_present_share_buffer_, false, device_prop.maxThreadsPerBlock)); if (attn_type_ == kDecoderMaskedMultiHeadAttention && attn.sequence_length != 1) { diff --git a/onnxruntime/contrib_ops/rocm/bert/multihead_attention.h b/onnxruntime/contrib_ops/rocm/bert/multihead_attention.h index 84d8b76bbfeb..1d676d7a7bca 100644 --- a/onnxruntime/contrib_ops/rocm/bert/multihead_attention.h +++ b/onnxruntime/contrib_ops/rocm/bert/multihead_attention.h @@ -25,6 +25,7 @@ class MultiHeadAttention final : public RocmKernel { float mask_filter_value_; float scale_; bool past_present_share_buffer_{false}; + bool is_unidirectional_{false}; // type-erased GemmSoftmaxGemmPermuteTunableOp, the reason for this is: // 1. We don't want to include the cuh file where GemmSoftmaxGemmPermuteTunableOp is defined. diff --git a/onnxruntime/contrib_ops/rocm/diffusion/group_norm.cc b/onnxruntime/contrib_ops/rocm/diffusion/group_norm.cc deleted file mode 100644 index e82e15a304f4..000000000000 --- a/onnxruntime/contrib_ops/rocm/diffusion/group_norm.cc +++ /dev/null @@ -1,152 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. 
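Since the CK attention path above now funnels every bias/mask variant through the shared GetArgAndRunInvoker helper, it is worth stating what the fused batched GEMM + softmax + GEMM kernel computes. The sketch below is a hypothetical host-side reference, not ORT code: for one batch-head slice it evaluates out = softmax(scale * Q*K^T + mask) * V, where the additive mask holds the filled values that LaunchConvertToFilledMaskValue writes into the workspace (0 for kept positions, a large negative fill for masked ones).

#include <algorithm>
#include <cmath>
#include <vector>

// Hypothetical reference, not ORT code. Dimension names follow the diff:
// Q is MxK, K is NxK, V is NxO, out is MxO for one batch-head slice.
std::vector<float> AttentionReference(const std::vector<float>& Q,
                                      const std::vector<float>& Kmat,
                                      const std::vector<float>& V,
                                      const std::vector<float>& mask,
                                      int M, int N, int K, int O, float scale) {
  std::vector<float> out(M * O, 0.0f);
  std::vector<float> row(N);
  for (int m = 0; m < M; ++m) {
    // Gemm1 with the Acc0ElementOp scaling, plus the additive mask/bias.
    float row_max = -INFINITY;
    for (int n = 0; n < N; ++n) {
      float acc = 0.0f;
      for (int k = 0; k < K; ++k) acc += Q[m * K + k] * Kmat[n * K + k];
      row[n] = acc * scale + mask[m * N + n];
      row_max = std::max(row_max, row[n]);
    }
    // Numerically stable softmax over the row.
    float denom = 0.0f;
    for (int n = 0; n < N; ++n) {
      row[n] = std::exp(row[n] - row_max);
      denom += row[n];
    }
    // Gemm2 against V.
    for (int o = 0; o < O; ++o) {
      float acc = 0.0f;
      for (int n = 0; n < N; ++n) acc += (row[n] / denom) * V[n * O + o];
      out[m * O + o] = acc;
    }
  }
  return out;
}

The MaskOutUpperTriangle specialization registered above bakes the causal case into the kernel instead of materializing a mask (positions with n > m are excluded), which is why its op also insists on sequence_length == total_sequence_length.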
- -#include "core/providers/rocm/rocm_common.h" -#include "contrib_ops/rocm/diffusion/group_norm.h" -#include "contrib_ops/rocm/diffusion/group_norm_impl.h" - -namespace onnxruntime { -namespace contrib { -namespace rocm { - -#define GROUP_NORM_TYPES float, MLFloat16 - -ONNX_OPERATOR_KERNEL_EX( - GroupNorm, kMSDomain, 1, kRocmExecutionProvider, - (*KernelDefBuilder::Create()).TypeConstraint("T", BuildKernelDefConstraints()), GroupNorm); - -using namespace ONNX_NAMESPACE; - -namespace { -template -struct DispatchGroupNorm { - Status operator()(RocmTuningContext* tuning_ctx, - Stream* stream, - Tensor* output, - const Tensor* input, - const Tensor* gamma, - const Tensor* beta, - void* workspace, - float epsilon, - int batch_size, - int num_channels, - int height, - int width, - int num_groups, - bool use_swish_activation) { - typedef typename ToHipType::MappedType HipT; - return LaunchGroupNormKernel( - tuning_ctx, - stream, - reinterpret_cast(output->MutableData()), - reinterpret_cast(input->Data()), - gamma->Data(), - beta->Data(), - workspace, - epsilon, - batch_size, - num_channels, - height, - width, - num_groups, - use_swish_activation); - } -}; - -} // namespace - -GroupNorm::GroupNorm(const OpKernelInfo& op_info) : RocmKernel(op_info) { - epsilon_ = op_info.GetAttrOrDefault("epsilon", 1e-5f); - ORT_ENFORCE(epsilon_ >= 0); - - int64_t num_groups; - ORT_ENFORCE(op_info.GetAttr("groups", &num_groups).IsOK()); - ORT_ENFORCE(num_groups >= 0); - num_groups_ = static_cast(num_groups); - - int64_t activation; - ORT_ENFORCE(op_info.GetAttr("activation", &activation).IsOK()); - ORT_ENFORCE(activation == 0 || activation == 1); // 0 is None, 1 is Swish - use_swish_activation_ = (activation == 1); - - channels_last_ = (op_info.GetAttrOrDefault("channels_last", static_cast(1)) != 0); -} - -Status GroupNorm::PrePack(const Tensor& /*tensor*/, int /*input_idx*/, AllocatorPtr /*alloc*/, - bool& is_packed, PrePackedWeights* /*prepacked_weights*/) { - is_packed = false; - return Status::OK(); -} - -Status GroupNorm::ComputeInternal(OpKernelContext* context) const { - const Tensor* input = context->Input(0); - const Tensor* gamma = context->Input(1); - const Tensor* beta = context->Input(2); - Tensor* output = context->Output(0, input->Shape()); - - if (!channels_last_) { - return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, - "only the channels_last layout is supported"); - } - - const auto& input_dims = input->Shape().GetDims(); - if (input_dims.size() != 4) { - return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, - "input is expected to have 4 dimensions, got ", input_dims.size()); - } - - const auto& gamma_dims = gamma->Shape().GetDims(); - if (gamma_dims.size() != 1) { - return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, - "gamma is expected to have 1 dimension, got ", gamma_dims.size()); - } - if (gamma_dims[0] != input_dims[3]) { - return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, - "Number of channels in gamma and input does not match"); - } - - const auto& beta_dims = beta->Shape().GetDims(); - if (beta_dims.size() != 1) { - return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, - "beta is expected to have 1 dimension, got ", beta_dims.size()); - } - if (beta_dims[0] != input_dims[3]) { - return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, - "Number of channels in beta and input does not match"); - } - - // Input and output format is NHWC - int batch_size = static_cast(input_dims[0]); - int num_channels = static_cast(input_dims[3]); - int height = static_cast(input_dims[1]); - int 
width = static_cast(input_dims[2]); - - if (num_channels % num_groups_ != 0) { - return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, - "number of channels should be divisible by num_groups"); - } - - if (context->GetUseDeterministicCompute()) { - static std::once_flag log_warning; - std::call_once(log_warning, []() { - LOGS_DEFAULT(WARNING) << "GroupNorm has no deterministic GPU kernel, its outputs may still be nondeterministic."; - }); - } - - auto workspace = GetScratchBuffer(GetGroupNormWorkspaceSizeInBytes(), context->GetComputeStream()); - - utils::MLTypeCallDispatcher dispatcher(input->GetElementType()); - return dispatcher.InvokeRet(GetTuningContext(), context->GetComputeStream(), - output, input, gamma, beta, workspace.get(), - epsilon_, - batch_size, - num_channels, - height, - width, - num_groups_, - use_swish_activation_); -} - -} // namespace rocm -} // namespace contrib -} // namespace onnxruntime diff --git a/onnxruntime/contrib_ops/rocm/diffusion/group_norm_ck.cuh b/onnxruntime/contrib_ops/rocm/diffusion/group_norm_ck.cuh index fb7091592c16..d0a0d09fcbae 100644 --- a/onnxruntime/contrib_ops/rocm/diffusion/group_norm_ck.cuh +++ b/onnxruntime/contrib_ops/rocm/diffusion/group_norm_ck.cuh @@ -26,13 +26,18 @@ namespace rocm { using onnxruntime::rocm::CKDataTypeAdaptor; -using Swish = ck::tensor_operation::element_wise::Swish; +// The SiLU function is a special case of Swish function, +// The Swish function is parametrized by b, which is set to 1.0 for SiLU. They are defined as: +// SiLU(x) = x * sigmoid(x) +// Swish(x) = x * sigmoid(bx) +// The default value of b is 1.0 in ck::tensor_operation::element_wise::Swish function. We treat them as the same function here. +using Silu = ck::tensor_operation::element_wise::Swish; using Pass = ck::tensor_operation::element_wise::PassThrough; constexpr int Rank = 5; constexpr int NumReduceDim = 3; -template +template auto GetCKGroupNormNHWCTypeStringAndOps() { using XDataType = typename CKDataTypeAdaptor::type; using YDataType = typename CKDataTypeAdaptor::type; @@ -40,26 +45,30 @@ auto GetCKGroupNormNHWCTypeStringAndOps() { using GammaDataType = float; using BetaDataType = float; - using Activation = std::conditional_t; + using Activation = std::conditional_t; - std::vector>>> ret; + std::vector>>> ret; for (auto&& impl : internal::GetDeviceGroupNormInstances()) { - std::string swish_suffix = WithSwish ? "_Swish" : "_Pass"; - auto type_string = onnxruntime::MakeString(impl->GetTypeString()) + swish_suffix; + std::string silu_suffix = WithSilu ? 
"_Silu" : "_Pass"; + auto type_string = onnxruntime::MakeString(impl->GetTypeString()) + silu_suffix; auto invoker = impl->MakeInvokerPointer(); - auto ck_group_norm_op = [impl = std::move(impl), invoker = std::move(invoker)](const GroupNormNHWCParams* params) -> Status { - if constexpr (WithSwish) { + auto ck_group_norm_op = [impl = std::move(impl), invoker = std::move(invoker)]( + const GroupNormNHWCTunableParams* params) -> Status { + TUNABLE_OP_RETURN_UNSUPPORTED_ARGUMENT_IF((params->skip != nullptr || params->bias != nullptr), + "Input skip or bias is not supported by composable kernel."); + if constexpr (WithSilu) { TUNABLE_OP_RETURN_UNSUPPORTED_ARGUMENT_IF( - !params->withSwish, "Swish version only support groupnorm with swish"); + !params->use_silu, "Silu version only support groupnorm with silu"); } else { TUNABLE_OP_RETURN_UNSUPPORTED_ARGUMENT_IF( - params->withSwish, "Pass version only support groupnorm without swish"); + params->use_silu, "Pass version only support groupnorm without silu"); } - std::vector in_lengths{params->n, params->h, params->w, params->groups, params->cPerGroup}; - std::vector in_out_strides{params->h * params->w * params->c, params->w * params->c, params->c, params->cPerGroup, 1}; - std::vector gamma_beta_strides{0, 0, 0, params->cPerGroup, 1}; + std::vector in_lengths{params->n, params->h, params->w, params->groups, params->channels_per_group}; + std::vector in_out_strides{params->h * params->w * params->c, params->w * params->c, + params->c, params->channels_per_group, 1}; + std::vector gamma_beta_strides{0, 0, 0, params->channels_per_group, 1}; std::vector reduce_dims{1, 2, 4}; auto activation = Activation{}; diff --git a/onnxruntime/contrib_ops/rocm/diffusion/group_norm_ck_impl/impl.cuh b/onnxruntime/contrib_ops/rocm/diffusion/group_norm_ck_impl/impl.cuh index 19b081881dce..4cb371fdcf96 100644 --- a/onnxruntime/contrib_ops/rocm/diffusion/group_norm_ck_impl/impl.cuh +++ b/onnxruntime/contrib_ops/rocm/diffusion/group_norm_ck_impl/impl.cuh @@ -18,7 +18,7 @@ namespace internal { using F16 = ck::half_t; using F32 = float; -using Swish = ck::tensor_operation::element_wise::Swish; +using Silu = ck::tensor_operation::element_wise::Swish; using Pass = ck::tensor_operation::element_wise::PassThrough; using ck::tensor_operation::device::DeviceNormalizationFwd; // the interface @@ -101,9 +101,9 @@ GetDeviceGroupNormInstances() { template <> std::vector>> + F16, F32, F32, F16, F32, Silu, 5, 3>>> GetDeviceGroupNormInstances< - F16, F32, F32, F16, F32, Swish, 5, 3>(); + F16, F32, F32, F16, F32, Silu, 5, 3>(); template <> std::vector std::vector>> + F32, F32, F32, F32, F32, Silu, 5, 3>>> GetDeviceGroupNormInstances< - F32, F32, F32, F32, F32, Swish, 5, 3>(); + F32, F32, F32, F32, F32, Silu, 5, 3>(); template <> std::vector -std::vector>> -GetDeviceGroupNormInstances() { - std::vector>> instances; +std::vector>> +GetDeviceGroupNormInstances() { + std::vector>> instances; ck::tensor_operation::device::instance::add_device_operation_instances( instances, - device_normalization_f16_instances{}); + device_normalization_f16_instances{}); return instances; } diff --git a/onnxruntime/contrib_ops/rocm/diffusion/group_norm_ck_impl/impl_fp32.cu b/onnxruntime/contrib_ops/rocm/diffusion/group_norm_ck_impl/impl_fp32.cu index 9b0ccab17b4c..ceb53ed442ab 100644 --- a/onnxruntime/contrib_ops/rocm/diffusion/group_norm_ck_impl/impl_fp32.cu +++ b/onnxruntime/contrib_ops/rocm/diffusion/group_norm_ck_impl/impl_fp32.cu @@ -11,12 +11,12 @@ namespace rocm { namespace internal { template <> 
-std::vector>> -GetDeviceGroupNormInstances() { - std::vector>> instances; +std::vector>> +GetDeviceGroupNormInstances() { + std::vector>> instances; ck::tensor_operation::device::instance::add_device_operation_instances( instances, - device_normalization_f32_instances{}); + device_normalization_f32_instances{}); return instances; } diff --git a/onnxruntime/contrib_ops/rocm/diffusion/group_norm_common.h b/onnxruntime/contrib_ops/rocm/diffusion/group_norm_common.h index 008ae20b0561..7cff640db2f3 100644 --- a/onnxruntime/contrib_ops/rocm/diffusion/group_norm_common.h +++ b/onnxruntime/contrib_ops/rocm/diffusion/group_norm_common.h @@ -8,110 +8,47 @@ #include "core/providers/rocm/cu_inc/common.cuh" #include "core/providers/rocm/rocm_common.h" #include "core/providers/rocm/tunable/rocm_tunable.h" +#include "contrib_ops/rocm/diffusion/group_norm_common_base.h" namespace onnxruntime { namespace contrib { namespace rocm { -using onnxruntime::rocm::CeilDiv; - -int32_t findMaxDivisor(int32_t n, int32_t maxAllowedDivisor) { - int32_t maxDivisor = -1; - for (int32_t i = 1; i <= std::sqrt(n); i++) { - if (n % i == 0) { - int32_t divisor1 = n / i; - int32_t divisor2 = i; - - if (divisor1 > maxDivisor && divisor1 < maxAllowedDivisor) { - maxDivisor = divisor1; - } - if (divisor2 > maxDivisor && divisor2 < maxAllowedDivisor) { - maxDivisor = divisor2; - } - } - } - return maxDivisor; -} - template -struct GroupNormNHWCParams : OpParams { - GroupNormNHWCParams(RocmTuningContext* tuning_ctx, onnxruntime::Stream* stream, T* dst, float* redBuffer, const T* src, const float* gamma, - const float* beta, int32_t n, int32_t h, int32_t w, int32_t c, int32_t groups, float epsilon, bool withSwish) - : OpParams(tuning_ctx, stream), dst(dst), src(src), gamma(gamma), beta(beta), redBuffer(redBuffer), epsilon(epsilon), n(n), h(h), w(w), c(c), groups(groups), withSwish(withSwish) { - int32_t maxBlocksPerHW = 1024; - switch (c) { - case 960: - case 1920: - cPerBlock = 480; - break; - case 512: - case 256: - cPerBlock = 256; - break; - case 128: - cPerBlock = 128; - break; - default: - cPerBlock = 320; - } - - hw = h * w; - const int32_t blocksPerHW = findMaxDivisor(hw, maxBlocksPerHW); - hwPerBlock = CeilDiv(hw, blocksPerHW); - cPerGroup = c / groups; - hwc = hw * c; - invHWC = 1.F / (float)(hw * cPerGroup); - groupsPerBlock = cPerBlock / cPerGroup; - } +struct GroupNormNHWCTunableParams : OpParams, GroupNormNHWCParams { + GroupNormNHWCTunableParams(RocmTuningContext* tuning_ctx, + onnxruntime::Stream* ort_stream, + T* output, + T* add_out, + const T* input, + const T* skip, + const T* bias, + const float* gamma, + const float* beta, + float* workspace, + float epsilon, + int batch_size, + int num_channels, + int height, + int width, + int num_groups, + bool use_silu, + bool broadcast_skip, + int channels_per_block) + : OpParams(tuning_ctx, ort_stream), + GroupNormNHWCParams(output, add_out, input, skip, bias, gamma, beta, workspace, epsilon, batch_size, + num_channels, height, width, num_groups, use_silu, broadcast_skip, channels_per_block) {} std::string Signature() const override { - std::string swish_suffix = withSwish ? "_Swish" : "_Pass"; - std::string sig = std::to_string(n) + "_" + std::to_string(h * w) + "_" + std::to_string(c) + "_" + std::to_string(groups) + swish_suffix; + std::string silu_suffix = this->use_silu ? "_silu" : "_pass"; + std::string skip_suffix = this->skip != nullptr ? "_skip" : "_noskip"; + std::string broadcast_suffix = this->broadcast_skip ? 
"_broadcast" : "_nobroadcast"; + std::string bias_suffix = this->bias != nullptr ? "_bias" : "_nobias"; + std::string sig = std::to_string(this->n) + "_" + std::to_string(this->h * this->w) + "_" + + std::to_string(this->c) + "_" + std::to_string(this->groups) + silu_suffix + + skip_suffix + broadcast_suffix + bias_suffix; return sig; } - - // The output buffer. Layout NHWC. - T* dst; - // The input buffer. Layout NHWC. - T const* src; - // The gamma scaling factor. - float const* gamma; - // The beta term to add in GN. - float const* beta; - // The temporary buffer to do the global parallel reduction. Size: - // BLOCKS_PER_BATCH x C x 2. - float* redBuffer; - float epsilon; - - // The number of instances in the batch. - int32_t n; - // The height and width of each activation map. - int32_t h; - int32_t w; - // The number of channels. - int32_t c; - // The number of groups. - int32_t groups; - // Do we apply the Swish activation function? - bool withSwish; - - // Precomputed values and parameters to control the execution of the kernels. - - // The number of activations per instance (h * w) and the number of - // activations per block. - int32_t hw; - int32_t hwPerBlock; - // The number of channels per group and blocks per activation in the C - // dimension. - int32_t cPerBlock; - int32_t cPerGroup; - - // The precomputed stride between instances. - int32_t hwc; - // The inverse of hwc in floats (to compute mean/var). - float invHWC; - // The precomputed number of groups per block. - int32_t groupsPerBlock; }; } // namespace rocm diff --git a/onnxruntime/contrib_ops/rocm/diffusion/group_norm_impl.cu b/onnxruntime/contrib_ops/rocm/diffusion/group_norm_impl.cu index dbd5009e6367..142aaf14e8d2 100644 --- a/onnxruntime/contrib_ops/rocm/diffusion/group_norm_impl.cu +++ b/onnxruntime/contrib_ops/rocm/diffusion/group_norm_impl.cu @@ -15,9 +15,12 @@ namespace rocm { template Status LaunchGroupNormKernel( RocmTuningContext* tuning_ctx, - Stream* stream, + Stream* ort_stream, T* output, + T* add_out, const T* input, + const T* skip, + const T* bias, const float* gamma, const float* beta, void* workspace, @@ -27,19 +30,26 @@ Status LaunchGroupNormKernel( int height, int width, int num_groups, - bool use_swish_activation) { - if (batch_size > static_cast(kMaxGroupNormBatchSize)) { - return ORT_MAKE_STATUS(ONNXRUNTIME, StatusCode::NOT_IMPLEMENTED, - "only support batch_size <= 32. Got", batch_size); - } + bool use_silu, + bool broadcast_skip, + int channels_per_block) { + GroupNormNHWCTunableParams params(tuning_ctx, ort_stream, output, add_out, input, skip, bias, gamma, beta, + reinterpret_cast(workspace), epsilon, batch_size, num_channels, + height, width, num_groups, use_silu, broadcast_skip, channels_per_block); - if (num_groups != static_cast(kGroupNormNumberOfGroups)) { - return ORT_MAKE_STATUS(ONNXRUNTIME, StatusCode::NOT_IMPLEMENTED, - "only num_groups=32 is supported. 
Got", num_groups); + if (params.channels_per_block % params.channels_per_group != 0 || + params.channels_per_block > kMaxSize || + (params.channels_per_group % CHANNELS_PER_THREAD != 0)) { + return ORT_MAKE_STATUS(ONNXRUNTIME, NOT_IMPLEMENTED, + "GroupNorm in ROCM does not support the input: n=", batch_size, + " h=", height, + " w=", width, + " c=", num_channels, + " groups=", num_groups); } - GroupNormNHWCParams params(tuning_ctx, stream, output, reinterpret_cast(workspace), input, gamma, beta, - batch_size, height, width, num_channels, num_groups, epsilon, use_swish_activation); + HIP_RETURN_IF_ERROR(hipMemsetAsync( + params.group_sum_buffer, 0, GetGroupNormWorkspaceSizeInBytes(batch_size, num_groups), params.StreamHandle())); if (tuning_ctx->IsTunableOpEnabled()) { static GroupNormNHWCTunableOp op; @@ -50,14 +60,17 @@ Status LaunchGroupNormKernel( } template Status LaunchGroupNormKernel(RocmTuningContext* tuning_ctx, Stream* stream, half* output, - const half* input, const float* gamma, const float* beta, void* workspace, - float epsilon, int batch_size, int num_channels, - int height, int width, int num_groups, bool swish); + half* add_out, const half* input, const half* skip, const half* bias, + const float* gamma, const float* beta, void* workspace, float epsilon, + int batch_size, int num_channels, int height, int width, int num_groups, + bool use_silu, bool broadcast_skip, int channels_per_block); template Status LaunchGroupNormKernel(RocmTuningContext* tuning_ctx, Stream* stream, float* output, - const float* input, const float* gamma, const float* beta, void* workspace, - float epsilon, int batch_size, int num_channels, - int height, int width, int num_groups, bool swish); + float* add_out, const float* input, const float* skip, const float* bias, + const float* gamma, const float* beta, void* workspace, float epsilon, + int batch_size, int num_channels, int height, int width, int num_groups, + bool use_silu, bool broadcast_skip, int channels_per_block); + } // namespace rocm } // namespace contrib } // namespace onnxruntime diff --git a/onnxruntime/contrib_ops/rocm/diffusion/group_norm_impl.h b/onnxruntime/contrib_ops/rocm/diffusion/group_norm_impl.h deleted file mode 100644 index a0f7e0aca5de..000000000000 --- a/onnxruntime/contrib_ops/rocm/diffusion/group_norm_impl.h +++ /dev/null @@ -1,47 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. 
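The deleted header that follows hard-coded the reduction workspace to batch_size <= 32 and exactly 32 groups; the new launch path instead sizes it per call and zeroes it with hipMemsetAsync before the sum kernel runs. A sketch of the arithmetic, under the assumption (the real helper lives in group_norm_common_base.h, which this diff does not show) that the workspace holds one running sum and one running sum-of-squares per (batch, group) pair:

#include <cmath>
#include <cstddef>

// Sketch of the sizing implied by the new hipMemsetAsync call; treat the
// exact signature as an assumption.
inline size_t GroupNormWorkspaceBytes(size_t batch_size, size_t num_groups) {
  // One running sum and one running sum-of-squares per (batch, group) pair;
  // per-block partials are folded in with atomicAdd by the sum kernel.
  return sizeof(float) * 2 * batch_size * num_groups;
}

// Recovering the statistics in the scale pass, matching the arithmetic of the
// deleted groupNormNHWCScaleKernel further below:
// inv_count is 1 / (hw * channels_per_group).
inline void GroupStats(float sum, float sum_sq, float inv_count, float epsilon,
                       float& mean, float& inv_std_dev) {
  mean = sum * inv_count;
  float var = sum_sq * inv_count - mean * mean;
  inv_std_dev = var <= 0.0f ? 1.0f : 1.0f / std::sqrt(var + epsilon);
}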
- -#pragma once - -#include -#include - -#include "core/common/common.h" -#include "core/common/status.h" -#include "core/providers/rocm/tunable/rocm_tunable.h" - -using onnxruntime::rocm::tunable::RocmTuningContext; - -namespace onnxruntime { -namespace contrib { -namespace rocm { - -constexpr size_t kMaxGroupNormBatchSize = 32; -constexpr size_t kGroupNormNumberOfGroups = 32; - -constexpr size_t GetGroupNormWorkspaceSizeInBytes() { - // Two buffers for sum and squared sum - return (sizeof(float) * 2) * kMaxGroupNormBatchSize * kGroupNormNumberOfGroups; -} - -template -Status LaunchGroupNormKernel( - RocmTuningContext* tuning_ctx, - Stream* stream, - T* output, // normalized output tensor - const T* input, // input tensor - const float* gamma, // gamma (also known as weight or scale) - const float* beta, // beta (also known as bias) - void* workspace, // Work space - float epsilon, // epsilon used normalization - int batch_size, // N - int num_channels, // C - int height, // H - int width, // W - int num_groups, // number of groups - bool use_swish_activation // Whether there is Swish activation after group normalization -); - -} // namespace rocm -} // namespace contrib -} // namespace onnxruntime diff --git a/onnxruntime/contrib_ops/rocm/diffusion/group_norm_impl_kernel.cuh b/onnxruntime/contrib_ops/rocm/diffusion/group_norm_impl_kernel.cuh deleted file mode 100644 index d6322a12a936..000000000000 --- a/onnxruntime/contrib_ops/rocm/diffusion/group_norm_impl_kernel.cuh +++ /dev/null @@ -1,213 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -// The ROCm kernel is modified from TensorRT 8.5. -/* - * SPDX-FileCopyrightText: Copyright (c) 1993-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. - * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include -#include -#include "core/providers/rocm/cu_inc/common.cuh" -#include "core/providers/rocm/rocm_common.h" - -namespace onnxruntime { -namespace contrib { -namespace rocm { - -static inline __device__ __host__ float sigmoid(float x) { - return 1.F / (1.F + expf(-x)); -} - -struct GroupSums { - // Is it the 1st element of the group? - int32_t flag; - // The sum. - float sum; - // The sum of squares. - float sumSq; -}; - -struct GroupSumsOp { - inline __device__ GroupSums operator()(GroupSums const& a, GroupSums const& b) { - GroupSums dst; - dst.sum = b.flag ? b.sum : (a.sum + b.sum); - dst.sumSq = b.flag ? 
b.sumSq : (a.sumSq + b.sumSq); - dst.flag = a.flag + b.flag; - return dst; - } -}; - -template -inline __device__ void UpdateSum(const T* src, int64_t offset, U& sum, U& sumSq) { - using VecT = onnxruntime::rocm::aligned_vector; - const VecT input_v = *reinterpret_cast(src + offset); - -#pragma unroll - for (int i = 0; i < ILP; i++) { - const U val = static_cast(input_v.val[i]); - sum += val; - sumSq += val * val; - } -} - -template -__global__ void groupNormNHWCSumKernel(const T* src, float* redBuffer, int32_t cPerBlock, int32_t hwPerBlock, int32_t hw, - int32_t hwc, int32_t c, int32_t cPerGroup, int32_t groups, int32_t groupsPerBlock) { - // The object in charge of doing the sums for the different blocks. - typedef hipcub::BlockScan BlockScan; - - // Allocate shared memory for BlockScan. - __shared__ typename BlockScan::TempStorage tempStorage; - // Allocate shared memory for the groups. We could reduce the amount of shared - // memory reserved. - __shared__ float2 smem[ThreadsPerBlock]; - - // The instance in the batch. - int32_t ni = blockIdx.z; - // The channel loaded by that thread (ILP channels per thread). - int32_t ci = blockIdx.x * cPerBlock + threadIdx.x * ILP; - - // The first activation loaded by that block. - int32_t hwBegin = blockIdx.y * hwPerBlock; - // The last activation loaded by that block. - int32_t hwEnd = min(hwBegin + hwPerBlock, hw); - - // The sums. - float sum = 0.F; - float sumSq = 0.F; - - // Iterate over the activations to compute the sums. - if (ci < c) { - for (int32_t hwi = hwBegin; hwi < hwEnd; ++hwi) { - // The offset. - int64_t offset = static_cast(ni) * hwc + static_cast(hwi) * c + ci; - UpdateSum(src, offset, sum, sumSq); - } - } - - // The group that thread works on and the channel in the group (modulus). - int32_t gi = threadIdx.x * ILP / cPerGroup; - int32_t cj = threadIdx.x * ILP - cPerGroup * gi; - - // The data for the summations. - GroupSums inp{cj == 0 ? 1 : 0, sum, sumSq}; - - // Do the segmented scan. - GroupSums out; - BlockScan(tempStorage).InclusiveScan(inp, out, GroupSumsOp()); - - // Store the results for the groups in shared memory (to produce coalesced - // stores later). - if (cj == cPerGroup - ILP) { // ILP channels per thread - smem[gi] = make_float2(out.sum, out.sumSq); - } - - // Make sure the data is in shared memory. - __syncthreads(); - - // The global group index. - int32_t gj = blockIdx.x * groupsPerBlock + threadIdx.x; - - // Threads that have nothing left to do, exit. - if (threadIdx.x >= groupsPerBlock || gj >= groups) { - return; - } - - // The first threads (those storing to global memory, load the values). - float2 sums = smem[threadIdx.x]; - - // Store to global memory. 
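The sum kernel being removed here (and its GroupNormNHWCSumKernel successor) reduces per-group statistics with a segmented inclusive scan: each thread contributes a {flag, sum, sumSq} triple, and the combine operator restarts the accumulation whenever the right-hand operand's flag marks the first channel of a group, so sums never leak across group boundaries. A small standalone illustration of that combine rule (hypothetical names, host-side only):

#include <cstdio>

struct GroupSums { int flag; float sum; float sumSq; };

// Same combine rule as GroupSumsOp: a set flag on the right operand starts a
// new segment.
GroupSums Combine(const GroupSums& a, const GroupSums& b) {
  GroupSums dst;
  dst.sum = b.flag ? b.sum : a.sum + b.sum;
  dst.sumSq = b.flag ? b.sumSq : a.sumSq + b.sumSq;
  dst.flag = a.flag + b.flag;
  return dst;
}

int main() {
  // Two groups of two channels each; flag == 1 marks a group's first channel.
  GroupSums xs[] = {{1, 1.f, 1.f}, {0, 2.f, 4.f}, {1, 3.f, 9.f}, {0, 4.f, 16.f}};
  GroupSums running = xs[0];
  for (int i = 1; i < 4; ++i) {
    running = Combine(running, xs[i]);
    std::printf("i=%d sum=%.0f sumSq=%.0f\n", i, running.sum, running.sumSq);
  }
  // The last element of each segment holds that group's totals:
  // i=1 -> sum=3, sumSq=5 (group 0); i=3 -> sum=7, sumSq=25 (group 1).
  return 0;
}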
- atomicAdd(&redBuffer[(2 * ni + 0) * groups + gj], sums.x); - atomicAdd(&redBuffer[(2 * ni + 1) * groups + gj], sums.y); -} - -template -__device__ void computeGroupNorm(const T* src, T* dst, int64_t offset, U mean, U invStdDev, - const U* gamma_v, const U* beta_v, bool swish) { - using VecT = onnxruntime::rocm::aligned_vector; - const VecT input_v = *reinterpret_cast(src + offset); - VecT output_v; - -#pragma unroll - for (int i = 0; i < ILP; i++) { - U val = static_cast(input_v.val[i]); - val = (val - mean) * invStdDev; - val = gamma_v[i] * val + beta_v[i]; - - if (swish) { - val = val * sigmoid(val); - } - output_v.val[i] = static_cast(val); - } - *(reinterpret_cast(dst + offset)) = output_v; -} - -template -__global__ void groupNormNHWCScaleKernel(T* dst, const T* src, const float* gamma, const float* beta, const float* redBuffer, float epsilon, int32_t c, int32_t cPerBlock, - int32_t cPerGroup, int32_t groups, int32_t hwc, float invHWC, int32_t hw, int32_t hwPerBlock, bool withSwish) { - // The channel loaded by that thread (ILP channels per thread for F16x2). - int32_t ci = blockIdx.x * cPerBlock + threadIdx.x * ILP; - if (ci >= c) { - return; - } - - // The instance in the batch. - int32_t ni = blockIdx.z; - - // The group that thread works on and the channel in the group (modulus). - int32_t gi = ci / cPerGroup; - - // Load the sum and sum of squares for the group. - float sum = 0.F, sumSq = 0.F; - if (gi < groups) { - sum = redBuffer[(2 * ni + 0) * groups + gi]; - sumSq = redBuffer[(2 * ni + 1) * groups + gi]; - } - - using VecF = onnxruntime::rocm::aligned_vector; - - const VecF gamma_v = *reinterpret_cast(gamma + ci); - const VecF beta_v = *reinterpret_cast(beta + ci); - - // Compute the mean. - float mean = sum * invHWC; - // Compute the variance. - float var = sumSq * invHWC - (mean * mean); - // Compute the inverse of the stddev. - float invStdDev = var <= 0.F ? 1.F : rsqrtf(var + epsilon); - - // The first activation loaded by that block. - int32_t hwBegin = blockIdx.y * hwPerBlock; - // The last activation loaded by that block. - int32_t hwEnd = min(hwBegin + hwPerBlock, hw); - - // Iterate over the activations to compute the sums. - for (int32_t hwi = hwBegin; hwi < hwEnd; ++hwi) { - // The src/dst offset. - int64_t offset = (int64_t)ni * hwc + hwi * c + ci; - - // Fetch ILP channels per thread. - computeGroupNorm(src, dst, offset, mean, invStdDev, gamma_v.val, beta_v.val, withSwish); - } -} - -} // namespace rocm -} // namespace contrib -} // namespace onnxruntime diff --git a/onnxruntime/contrib_ops/rocm/diffusion/group_norm_triton.cuh b/onnxruntime/contrib_ops/rocm/diffusion/group_norm_triton.cuh index b7b9441ac997..c6ca16bfdfc8 100644 --- a/onnxruntime/contrib_ops/rocm/diffusion/group_norm_triton.cuh +++ b/onnxruntime/contrib_ops/rocm/diffusion/group_norm_triton.cuh @@ -20,21 +20,21 @@ namespace rocm { namespace { -template +template std::string GetGroupNormTritonGroupName() { std::string ret = "GroupNormTriton_"; - std::string swish_suffix = WithSwish ? "Swish_" : "Pass_"; - ret += swish_suffix; + std::string silu_suffix = WithSilu ? 
"Silu_" : "Pass_"; + ret += silu_suffix; ret += GetDataTypeName(); return ret; } } // namespace -template +template auto GetTritonGroupNormNHWCTypeStringAndOps() { - std::vector>>> ret; - auto group_name = GetGroupNormTritonGroupName(); + std::vector>>> ret; + auto group_name = GetGroupNormTritonGroupName(); auto* kernel_list = GetOrtTritonKernelByGroup(group_name); if (kernel_list == nullptr) { return ret; @@ -45,36 +45,50 @@ auto GetTritonGroupNormNHWCTypeStringAndOps() { auto* metadata = GetOrtTritonKernelMetadata(i); auto block_size = metadata->constants.at("BLOCK_SIZE"); auto hw_size = metadata->constants.at("HW_SIZE"); - auto impl = [i, block_size, hw_size](const GroupNormNHWCParams* params) -> Status { + auto impl = [i, block_size, hw_size](const GroupNormNHWCTunableParams* params) -> Status { TUNABLE_OP_RETURN_UNSUPPORTED_ARGUMENT_IF( - params->cPerGroup > block_size || params->cPerGroup * 2 <= block_size, - "Arg block_size (", block_size, ") is not the next power of 2 of cPerGroup (", params->cPerGroup, ")."); + params->channels_per_group > block_size || params->channels_per_group * 2 <= block_size, + "Arg block_size (", block_size, ") is not the next power of 2 of channels_per_group (", + params->channels_per_group, ")."); TUNABLE_OP_RETURN_UNSUPPORTED_ARGUMENT_IF( params->hw % hw_size != 0, "Arg hw_size (", hw_size, ") is not a divisor of hw (", params->hw, ")."); - if constexpr (WithSwish) { - TUNABLE_OP_RETURN_UNSUPPORTED_ARGUMENT_IF(!params->withSwish, "Swish version does not support GN w/o swish."); + if constexpr (WithSilu) { + TUNABLE_OP_RETURN_UNSUPPORTED_ARGUMENT_IF(!params->use_silu, "Silu version does not support GN w/o silu."); } else { - TUNABLE_OP_RETURN_UNSUPPORTED_ARGUMENT_IF(params->withSwish, "Pass version does not support GN w/ swish."); + TUNABLE_OP_RETURN_UNSUPPORTED_ARGUMENT_IF(params->use_silu, "Pass version does not support GN w/ silu."); } // Construct args for launch kernel struct { - void* X; - void* Y; + const void* src; + const void* skip; + const void* bias; + void* out; + void* add_out; const void* gamma; const void* beta; int hw; int c; int c_per_group; float eps; + bool has_skip; + bool has_bias; + bool broadcast_skip; } args = { - (void*)params->src, + (const void*)params->src, + (const void*)params->skip, + (const void*)params->bias, (void*)params->dst, + (void*)params->skip_workspace, (const void*)params->gamma, (const void*)params->beta, params->hw, params->c, - params->cPerGroup, - params->epsilon}; + params->channels_per_group, + params->epsilon, + params->skip != nullptr, + params->bias != nullptr, + params->broadcast_skip, + }; // Grid dim is (batch_count, groups, 1) return LaunchTritonKernel(params->StreamHandle(), i, params->n, params->groups, 1, &args, sizeof(args)); diff --git a/onnxruntime/contrib_ops/rocm/diffusion/group_norm_triton.py b/onnxruntime/contrib_ops/rocm/diffusion/group_norm_triton.py index 56b3a030b289..5ba96ebc117f 100644 --- a/onnxruntime/contrib_ops/rocm/diffusion/group_norm_triton.py +++ b/onnxruntime/contrib_ops/rocm/diffusion/group_norm_triton.py @@ -12,16 +12,22 @@ @triton.jit def group_norm_kernel( input_ptr, + skip_ptr, + bias_ptr, output_ptr, + add_out_ptr, gamma_ptr, beta_ptr, img_size, c, c_per_group, eps, + has_skip, + has_bias, + broadcast_skip, BLOCK_SIZE: tl.constexpr, HW_SIZE: tl.constexpr, - ACTIVATION_SWISH: tl.constexpr, + ACTIVATION_SILU: tl.constexpr, ): row_x = tl.program_id(0) row_y = tl.program_id(1) @@ -36,14 +42,35 @@ def group_norm_kernel( offsets = hw[:, None] * c + cols[None, :] mask = (cols 
< c_per_group)[None, :] + bias = tl.zeros([BLOCK_SIZE], dtype=tl.float32) + if has_skip: + add_out_ptr += row_x * stride + row_y * c_per_group + if broadcast_skip: + broadcast_skip_ptr = skip_ptr + row_x * c + row_y * c_per_group + bias += tl.load(broadcast_skip_ptr + cols, mask=cols < c_per_group, other=0.0).to(tl.float32) + else: + skip_ptr += row_x * stride + row_y * c_per_group + if has_bias: + bias_ptr += row_y * c_per_group + bias += tl.load(bias_ptr + cols, mask=cols < c_per_group, other=0.0).to(tl.float32) + # Calculate mean and variance _sum = tl.zeros([HW_SIZE, BLOCK_SIZE], dtype=tl.float32) _square_sum = tl.zeros([HW_SIZE, BLOCK_SIZE], dtype=tl.float32) for i in range(tl.cdiv(img_size, HW_SIZE)): x_ptr = input_ptr + i * HW_SIZE * c a = tl.load(x_ptr + offsets, mask=mask, other=0.0).to(tl.float32) + if has_skip and not broadcast_skip: + s_ptr = skip_ptr + i * HW_SIZE * c + s = tl.load(s_ptr + offsets, mask=mask, other=0.0).to(tl.float32) + a += s + if has_bias or broadcast_skip: + a += bias _sum += a _square_sum += a * a + if has_skip: + add_y_ptr = add_out_ptr + i * HW_SIZE * c + tl.store(add_y_ptr + offsets, a, mask=mask) # Set axis=None (or leave it unspecified) to reduce all axes. # TODO: In older Triton we have to reduce an axis at a time, but in our case @@ -57,12 +84,16 @@ def group_norm_kernel( gamma = tl.load(gamma_ptr + cols, mask=cols < c_per_group).to(tl.float32) beta = tl.load(beta_ptr + cols, mask=cols < c_per_group).to(tl.float32) for i in range(tl.cdiv(img_size, HW_SIZE)): - x_ptr = input_ptr + i * HW_SIZE * c y_ptr = output_ptr + i * HW_SIZE * c - x = tl.load(x_ptr + offsets, mask=mask, other=0.0).to(tl.float32) + if has_skip: + add_y_ptr = add_out_ptr + i * HW_SIZE * c + x = tl.load(add_y_ptr + offsets, mask=mask, other=0.0).to(tl.float32) + else: + x_ptr = input_ptr + i * HW_SIZE * c + x = tl.load(x_ptr + offsets, mask=mask, other=0.0).to(tl.float32) x_hat = (x - group_mean) * rstd y = x_hat * gamma + beta - if ACTIVATION_SWISH: + if ACTIVATION_SILU: y *= tl.sigmoid(y) tl.store(y_ptr + offsets, y, mask=mask) @@ -71,27 +102,27 @@ def group_norm_kernel( # blocks = [16, 32, 64, 128, 256, 512] # hw_sizes = [8, 16, 32, 64, 128, 256, 512] # but this will result in too many functions and slow down the compilation. 
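On the C++ side, the wrapper in group_norm_triton.cuh only accepts a compiled variant when BLOCK_SIZE is the next power of two at or above channels_per_group and HW_SIZE evenly divides hw; the ACTIVATION_SILU epilogue is exactly the y *= tl.sigmoid(y) step, i.e. SiLU(x) = x * sigmoid(x) as the earlier comment in group_norm_ck.cuh defines it. A standalone sketch of the same eligibility test (hypothetical helper, not part of the sources):

#include <cassert>

// Mirrors the TUNABLE_OP checks in group_norm_triton.cuh: usable iff
// channels_per_group <= block_size < 2 * channels_per_group (so a power-of-two
// block_size is the next power of two) and hw % hw_size == 0.
bool TritonGroupNormConfigUsable(int channels_per_group, int block_size,
                                 int hw, int hw_size) {
  bool block_ok = channels_per_group <= block_size &&
                  channels_per_group * 2 > block_size;
  bool hw_ok = hw % hw_size == 0;
  return block_ok && hw_ok;
}

int main() {
  assert(TritonGroupNormConfigUsable(10, 16, 64, 8));   // 16 is the next pow2 of 10
  assert(!TritonGroupNormConfigUsable(10, 32, 64, 8));  // 32 is too large
  assert(!TritonGroupNormConfigUsable(10, 16, 60, 8));  // 60 % 8 != 0
  return 0;
}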
-with_swish = [True, False] +with_silu = [True, False] dtypes = ["fp32", "fp16"] blocks = [16, 32, 64, 128] hw_sizes = [8, 16, 32, 64, 128, 256] warps = [1, 2, 4, 8, 16] name_pattern = "GroupNormTriton_{}_{}_b{}_hw{}_w{}" -sig_pattern = "*{},*{},*fp32,*fp32,i32,i32,i32,fp32" +sig_pattern = "*{},*{},*{},*{},*{},*fp32,*fp32,i32,i32,i32,fp32,i1,i1,i1" group_pattern = "GroupNormTriton_{}_{}" def get_function_table(): func_table = [] - for swish, dtype, hw_size, warp, b in product(with_swish, dtypes, hw_sizes, warps, blocks): - swish_suffix = "Swish" if swish else "Pass" - name = name_pattern.format(swish_suffix, dtype, b, hw_size, warp) - group = group_pattern.format(swish_suffix, dtype) - sig = sig_pattern.format(dtype, dtype) + for silu, dtype, hw_size, warp, b in product(with_silu, dtypes, hw_sizes, warps, blocks): + silu_suffix = "Silu" if silu else "Pass" + name = name_pattern.format(silu_suffix, dtype, b, hw_size, warp) + group = group_pattern.format(silu_suffix, dtype) + sig = sig_pattern.format(dtype, dtype, dtype, dtype, dtype) kwargs = { "num_warps": warp, - "constants": {"BLOCK_SIZE": b, "HW_SIZE": hw_size, "ACTIVATION_SWISH": int(swish)}, + "constants": {"BLOCK_SIZE": b, "HW_SIZE": hw_size, "ACTIVATION_SILU": int(silu)}, } func_desc = {"name": name, "group": group, "func": group_norm_kernel, "sig": sig, "kwargs": kwargs} func_table.append(func_desc) diff --git a/onnxruntime/contrib_ops/rocm/diffusion/group_norm_tunable_op.h b/onnxruntime/contrib_ops/rocm/diffusion/group_norm_tunable_op.h index 25d820f7ed32..e6831f764b41 100644 --- a/onnxruntime/contrib_ops/rocm/diffusion/group_norm_tunable_op.h +++ b/onnxruntime/contrib_ops/rocm/diffusion/group_norm_tunable_op.h @@ -20,115 +20,117 @@ namespace rocm { using onnxruntime::rocm::GPU_WARP_SIZE; template -void groupNormNHWCSum(const GroupNormNHWCParams* params) { - // Make sure the values are as we expect. - ORT_ENFORCE(params->c % params->cPerBlock == 0 && params->hw % params->hwPerBlock == 0); - // Make sure a group does not span multiple blocks. - ORT_ENFORCE(params->cPerBlock % params->cPerGroup == 0); - +void GroupNormNHWCSum(const GroupNormNHWCTunableParams* params) { dim3 grid; // The number of blocks to compute all the channels. - grid.x = params->c / params->cPerBlock; + grid.x = DivUp(params->c, params->channels_per_block); // The number of blocks to compute all the activations in a given instance. - grid.y = CeilDiv(params->hw, params->hwPerBlock); + grid.y = DivUp(params->hw, params->hw_per_block); // The number of instances. grid.z = params->n; -#define LAUNCH_GROUPNORM_SUM(ThreadsPerBlock, VecSize) \ - groupNormNHWCSumKernel \ - <<StreamHandle()>>>( \ - params->src, params->redBuffer, params->cPerBlock, \ - params->hwPerBlock, params->hw, params->hwc, params->c, \ - params->cPerGroup, params->groups, params->groupsPerBlock); \ +#define LAUNCH_GROUPNORM_SUM(ThreadsPerBlock, VecSize) \ + GroupNormNHWCSumKernel \ + <<StreamHandle()>>>( \ + params->skip_workspace, params->group_sum_buffer, params->src, params->skip, params->bias, \ + params->channels_per_block, params->hw_per_block, params->hw, params->hwc, params->c, \ + params->channels_per_group, params->groups, params->groups_per_block, params->broadcast_skip); \ break; - switch (params->cPerBlock) { - case 320: - LAUNCH_GROUPNORM_SUM(256, 2) - case 480: - LAUNCH_GROUPNORM_SUM(256, 2) + // Threads_per_block is half of values in kSizes since CHANNELS_PER_THREAD = 2. 
+ switch (params->threads_per_block) { case 256: - LAUNCH_GROUPNORM_SUM(128, 2) + LAUNCH_GROUPNORM_SUM(256, CHANNELS_PER_THREAD) + case 192: + LAUNCH_GROUPNORM_SUM(192, CHANNELS_PER_THREAD) + case 160: + LAUNCH_GROUPNORM_SUM(160, CHANNELS_PER_THREAD) case 128: - LAUNCH_GROUPNORM_SUM(64, 2) + LAUNCH_GROUPNORM_SUM(128, CHANNELS_PER_THREAD) + case 64: + LAUNCH_GROUPNORM_SUM(64, CHANNELS_PER_THREAD) default: ORT_NOT_IMPLEMENTED("Not implemented"); } } template -Status GroupNormNHWCSumOp(const GroupNormNHWCParams* params) { +Status GroupNormNHWCSumOp(const GroupNormNHWCTunableParams* params) { dim3 grid; - grid.x = params->c / params->cPerBlock; - grid.y = CeilDiv(params->hw, params->hwPerBlock); + grid.x = DivUp(params->c, params->channels_per_block); + grid.y = DivUp(params->hw, params->hw_per_block); grid.z = params->n; - groupNormNHWCSumKernel + GroupNormNHWCSumKernel <<StreamHandle()>>>( - params->src, params->redBuffer, params->cPerBlock, params->hwPerBlock, - params->hw, params->hwc, params->c, params->cPerGroup, params->groups, params->groupsPerBlock); + params->skip_workspace, params->group_sum_buffer, params->src, params->skip, params->bias, + params->channels_per_block, params->hw_per_block, params->hw, params->hwc, params->c, + params->channels_per_group, params->groups, params->groups_per_block, params->broadcast_skip); return HIP_CALL(hipGetLastError()); } template -void groupNormNHWCScale(const GroupNormNHWCParams* params) { - // Make sure the dimensions are aligned with what we expect. - ORT_ENFORCE(params->c % params->cPerBlock == 0); - // Make sure a group does not span multiple blocks. - ORT_ENFORCE(params->cPerBlock % params->cPerGroup == 0); - +void GroupNormNHWCScale(const GroupNormNHWCTunableParams* params) { dim3 grid; // The number of blocks to compute all the channels. - grid.x = params->c / params->cPerBlock; + grid.x = DivUp(params->c, params->channels_per_block); // The number of blocks to compute all the activations in a given instance. - grid.y = CeilDiv(params->hw, params->hwPerBlock); + grid.y = DivUp(params->hw, params->hw_per_block); // The number of instances. grid.z = params->n; -#define LAUNCH_GROUPNORM_SCALE(ThreadsPerBlock, VecSize) \ - groupNormNHWCScaleKernel \ - <<StreamHandle()>>>( \ - params->dst, params->src, params->gamma, params->beta, \ - params->redBuffer, params->epsilon, params->c, params->cPerBlock, \ - params->cPerGroup, params->groups, params->hwc, params->invHWC, \ - params->hw, params->hwPerBlock, params->withSwish); \ +#define LAUNCH_GROUPNORM_SCALE(ThreadsPerBlock, VecSize) \ + GroupNormNHWCScaleKernel \ + <<StreamHandle()>>>( \ + params->dst, params->src, params->skip, params->gamma, params->beta, params->skip_workspace, \ + params->group_sum_buffer, params->epsilon, params->c, params->channels_per_block, \ + params->channels_per_group, params->groups, params->hwc, params->inv_hw_channels_per_group, \ + params->hw, params->hw_per_block, params->use_silu); \ break; - switch (params->cPerBlock) { - case 320: - LAUNCH_GROUPNORM_SCALE(256, 2) - case 480: - LAUNCH_GROUPNORM_SCALE(256, 2) + // Threads_per_block is half of values in kSizes since CHANNELS_PER_THREAD = 2. 
+ switch (params->threads_per_block) { case 256: - LAUNCH_GROUPNORM_SCALE(128, 2) + LAUNCH_GROUPNORM_SCALE(256, CHANNELS_PER_THREAD) + case 192: + LAUNCH_GROUPNORM_SCALE(192, CHANNELS_PER_THREAD) + case 160: + LAUNCH_GROUPNORM_SCALE(160, CHANNELS_PER_THREAD) case 128: - LAUNCH_GROUPNORM_SCALE(64, 2) + LAUNCH_GROUPNORM_SCALE(128, CHANNELS_PER_THREAD) + case 64: + LAUNCH_GROUPNORM_SCALE(64, CHANNELS_PER_THREAD) default: ORT_NOT_IMPLEMENTED("Not implemented"); } } template -Status GroupNormNHWCScaleOp(const GroupNormNHWCParams* params) { +Status GroupNormNHWCScaleOp(const GroupNormNHWCTunableParams* params) { dim3 grid; - grid.x = params->c / params->cPerBlock; - grid.y = CeilDiv(params->hw, params->hwPerBlock); + grid.x = DivUp(params->c, params->channels_per_block); + grid.y = DivUp(params->hw, params->hw_per_block); grid.z = params->n; - groupNormNHWCScaleKernel + GroupNormNHWCScaleKernel <<StreamHandle()>>>( - params->dst, params->src, params->gamma, params->beta, params->redBuffer, params->epsilon, params->c, params->cPerBlock, - params->cPerGroup, params->groups, params->hwc, params->invHWC, params->hw, params->hwPerBlock, params->withSwish); + params->dst, params->src, params->skip, params->gamma, params->beta, params->skip_workspace, + params->group_sum_buffer, params->epsilon, params->c, params->channels_per_block, params->channels_per_group, + params->groups, params->hwc, params->inv_hw_channels_per_group, params->hw, params->hw_per_block, + params->use_silu); return HIP_CALL(hipGetLastError()); } template class GroupNormNHWCOp { public: - Status operator()(const GroupNormNHWCParams* params) { - HIP_RETURN_IF_ERROR(hipMemsetAsync(params->redBuffer, 0, GetGroupNormWorkspaceSizeInBytes(), params->StreamHandle())); + Status operator()(const GroupNormNHWCTunableParams* params) { + HIP_RETURN_IF_ERROR(hipMemsetAsync(params->group_sum_buffer, + 0, + GetGroupNormWorkspaceSizeInBytes(params->n, params->groups), + params->StreamHandle())); auto status = GroupNormNHWCSumOp(params); ORT_RETURN_IF_ERROR(status); HIP_RETURN_IF_ERROR(hipGetLastError()); @@ -138,29 +140,30 @@ class GroupNormNHWCOp { return Status::OK(); } - Status IsSupported(const GroupNormNHWCParams* params) { + Status IsSupported(const GroupNormNHWCTunableParams* params) { TUNABLE_OP_RETURN_UNSUPPORTED_ARGUMENT_IF( - !(params->c % VecSize == 0 && params->cPerGroup % VecSize == 0), - "The number of channels (", params->c, ") or the number of channels per group (", params->cPerGroup, + !(params->c % VecSize == 0 && params->channels_per_group % VecSize == 0), + "The number of channels (", params->c, ") or the number of channels per group (", params->channels_per_group, ") isn't divisible by the number of vector size: ", VecSize); - TUNABLE_OP_RETURN_UNSUPPORTED_ARGUMENT_IF(!(params->cPerBlock % params->cPerGroup == 0 && - params->c % params->cPerBlock == 0 && params->hw % params->hwPerBlock == 0), - "The value of attributes don't meet the requirements."); - TUNABLE_OP_RETURN_UNSUPPORTED_ARGUMENT_IF(!(params->cPerBlock <= ThreadsPerBlock * VecSize && - params->cPerBlock > (ThreadsPerBlock - GPU_WARP_SIZE) * VecSize), + TUNABLE_OP_RETURN_UNSUPPORTED_ARGUMENT_IF(!(params->channels_per_block <= ThreadsPerBlock * VecSize && + params->channels_per_block > (ThreadsPerBlock - GPU_WARP_SIZE) * VecSize), "Configuration: Threads (", ThreadsPerBlock, "), vector size (", - VecSize, ") is redundant for the number of channels per group: ", params->cPerBlock); + VecSize, ") is redundant for the number of channels per group: ", + 
params->channels_per_block); return Status::OK(); } }; template -Status GroupNormNHWCStaticSelection(const GroupNormNHWCParams* params) { - HIP_RETURN_IF_ERROR(hipMemsetAsync(params->redBuffer, 0, GetGroupNormWorkspaceSizeInBytes(), params->StreamHandle())); - groupNormNHWCSum(params); +Status GroupNormNHWCStaticSelection(const GroupNormNHWCTunableParams* params) { + HIP_RETURN_IF_ERROR(hipMemsetAsync(params->group_sum_buffer, + 0, + GetGroupNormWorkspaceSizeInBytes(params->n, params->groups), + params->StreamHandle())); + GroupNormNHWCSum(params); HIP_RETURN_IF_ERROR(hipGetLastError()); - groupNormNHWCScale(params); + GroupNormNHWCScale(params); HIP_RETURN_IF_ERROR(hipGetLastError()); return Status::OK(); } @@ -178,30 +181,30 @@ Status GroupNormNHWCStaticSelection(const GroupNormNHWCParams* params) { ADD_OP_FOR_ALL_VEC_SIZE(name, 320) template -class GroupNormNHWCTunableOp : public TunableOp> { +class GroupNormNHWCTunableOp : public TunableOp> { public: GroupNormNHWCTunableOp() { this->RegisterOp(GroupNormNHWCStaticSelection); ADD_OP_FOR_ALL_THREADS_PER_BLOCK_ALL_VEC_SIZE(GroupNormNHWCOp) #ifdef USE_COMPOSABLE_KERNEL - for (auto&& [_, op] : GetCKGroupNormNHWCTypeStringAndOps()) { + for (auto&& [_, op] : GetCKGroupNormNHWCTypeStringAndOps()) { ORT_UNUSED_PARAMETER(_); this->RegisterOp(std::move(op)); } - for (auto&& [_, op] : GetCKGroupNormNHWCTypeStringAndOps()) { + for (auto&& [_, op] : GetCKGroupNormNHWCTypeStringAndOps()) { ORT_UNUSED_PARAMETER(_); this->RegisterOp(std::move(op)); } #endif // USE_COMPOSABLE_KERNEL #ifdef USE_TRITON_KERNEL - for (auto&& [_, op] : GetTritonGroupNormNHWCTypeStringAndOps()) { + for (auto&& [_, op] : GetTritonGroupNormNHWCTypeStringAndOps()) { ORT_UNUSED_PARAMETER(_); this->RegisterOp(std::move(op)); } - for (auto&& [_, op] : GetTritonGroupNormNHWCTypeStringAndOps()) { + for (auto&& [_, op] : GetTritonGroupNormNHWCTypeStringAndOps()) { ORT_UNUSED_PARAMETER(_); this->RegisterOp(std::move(op)); } diff --git a/onnxruntime/contrib_ops/rocm/fused_conv.cc b/onnxruntime/contrib_ops/rocm/fused_conv.cc index d597e0d57fbc..63804f79a32f 100644 --- a/onnxruntime/contrib_ops/rocm/fused_conv.cc +++ b/onnxruntime/contrib_ops/rocm/fused_conv.cc @@ -76,7 +76,12 @@ struct FNVHash { void HashConvolutionDescriptor(miopenConvolutionDescriptor_t cdesc) { int spatial_dim = 1; #if ROCM_VERSION >= 50500 - miopenGetConvolutionSpatialDim(cdesc, &spatial_dim); + MIOPEN_CALL(miopenGetConvolutionSpatialDim(cdesc, &spatial_dim)); + std::vector pads{spatial_dim}; + std::vector strides{spatial_dim}; + std::vector dilations{spatial_dim}; + miopenConvolutionMode_t mode; + MIOPEN_CALL(miopenGetConvolutionNdDescriptor(cdesc, spatial_dim, &spatial_dim, pads.data(), strides.data(), dilations.data(), &mode)); #else // Previous versions of MIOpen doesn't provide API to probe the dimension of a // miopenConvolutionDescriptor_t, so we have to guess. 
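This hashing fix matters because the fusion-plan cache keys on the digest: by streaming pads, strides, dilations, and now the convolution mode into the hash, two descriptors that differ only in those fields no longer collide. The FNVHash internals are not shown in this hunk, so the following is an illustrative 32-bit FNV-1a with the standard constants, which may differ from the repo's variant:

#include <cstddef>
#include <cstdint>
#include <vector>

// Illustrative 32-bit FNV-1a; the FNVHash in fused_conv.cc may use a
// different variant or width.
struct Fnv1a {
  uint32_t state = 2166136261u;  // FNV offset basis

  void Add(const void* data, size_t len) {
    const auto* p = static_cast<const unsigned char*>(data);
    for (size_t i = 0; i < len; ++i) {
      state ^= p[i];
      state *= 16777619u;  // FNV prime
    }
  }

  template <typename T>
  Fnv1a& operator<<(const T& v) {  // for trivially copyable scalars
    Add(&v, sizeof(v));
    return *this;
  }

  template <typename T>
  Fnv1a& operator<<(const std::vector<T>& v) {  // pads/strides/dilations
    Add(v.data(), v.size() * sizeof(T));
    return *this;
  }
};

With this shape, hash << spatial_dim << pads << strides << dilations << mode; keys the cache on every field the fused convolution actually depends on.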
@@ -100,11 +105,12 @@ struct FNVHash { pads.resize(spatial_dim); strides.resize(spatial_dim); dilations.resize(spatial_dim); +#endif (*this) << spatial_dim; (*this) << pads; (*this) << strides; (*this) << dilations; -#endif + (*this) << mode; } private: @@ -313,6 +319,8 @@ class FusedConv : public onnxruntime::rocm::Conv { auto ret = miopenCompileFusionPlan(handle, fusion->plan); if (miopenStatusSuccess == ret) { fusion->compiled_on.insert(handle); + } else { + return ret; } return miopenStatusSuccess; } diff --git a/onnxruntime/contrib_ops/rocm/rocm_contrib_kernels.cc b/onnxruntime/contrib_ops/rocm/rocm_contrib_kernels.cc index 55cd6a1d112f..e19a976f3141 100644 --- a/onnxruntime/contrib_ops/rocm/rocm_contrib_kernels.cc +++ b/onnxruntime/contrib_ops/rocm/rocm_contrib_kernels.cc @@ -93,6 +93,7 @@ class ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, Samp class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 1, float, ScaledTanh); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 1, double, ScaledTanh); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 1, MLFloat16, ScaledTanh); +class ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, SkipGroupNorm); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, float, SkipLayerNormalization); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, MLFloat16, SkipLayerNormalization); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, float, SkipSimplifiedLayerNormalization); @@ -150,7 +151,7 @@ class ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kPytorchAtenDomain class ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, ShrunkenGather); #endif -#if defined(USE_MPI) && defined(ORT_USE_NCCL) +#ifdef ORT_USE_NCCL class ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, AllReduce); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, AllGather); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, AllToAll); @@ -246,6 +247,7 @@ Status RegisterRocmContribKernels(KernelRegistry& kernel_registry) { BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, + BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, @@ -309,7 +311,7 @@ Status RegisterRocmContribKernels(KernelRegistry& kernel_registry) { BuildKernelCreateInfo, #endif -#if defined(USE_MPI) && defined(ORT_USE_NCCL) +#ifdef ORT_USE_NCCL BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, diff --git a/onnxruntime/core/common/cpuid_info.cc b/onnxruntime/core/common/cpuid_info.cc index fcf9c2b03dea..be881f6bc4bc 100644 --- a/onnxruntime/core/common/cpuid_info.cc +++ b/onnxruntime/core/common/cpuid_info.cc @@ -30,6 +30,10 @@ #define HWCAP2_SVEI8MM (1 << 9) #endif +#ifndef HWCAP2_BF16 +#define HWCAP2_BF16 (1 << 14) +#endif + #endif // ARM #endif // Linux @@ -48,6 +52,13 @@ #if defined(CPUINFO_SUPPORTED) #include +#if defined(CPUIDINFO_ARCH_ARM) +namespace onnxruntime { +// The following function is declared in "core/common/cpuid_uarch.h" but we cannot include the whole header file because +// some of its symbols are conflict with +void decodeMIDR(uint32_t midr, uint32_t uarch[1]); +} // namespace onnxruntime +#endif #else #include "core/common/cpuid_uarch.h" #endif // CPUINFO_SUPPORTED @@ -138,16 +149,12 @@ void 
CPUIDInfo::ArmLinuxInit() { // Pytorch CPUINFO only works on ARM linux or android // Assuming no hyper-threading, no NUMA groups #ifdef CPUINFO_SUPPORTED - pytorch_cpuinfo_init_ = cpuinfo_initialize(); - if (!pytorch_cpuinfo_init_) { - LOGS_DEFAULT(WARNING) << "Failed to init pytorch cpuinfo library, may cause CPU EP performance degradation due to undetected CPU features."; - return; - } is_hybrid_ = cpuinfo_get_uarchs_count() > 1; has_arm_neon_dot_ = cpuinfo_has_arm_neon_dot(); has_fp16_ = cpuinfo_has_arm_neon_fp16_arith(); has_arm_neon_i8mm_ = cpuinfo_has_arm_i8mm(); has_arm_sve_i8mm_ = cpuinfo_has_arm_sve() && cpuinfo_has_arm_i8mm(); + has_arm_neon_bf16_ = cpuinfo_has_arm_neon_bf16(); const uint32_t core_cnt = cpuinfo_get_cores_count(); core_uarchs_.resize(core_cnt, cpuinfo_uarch_unknown); @@ -177,6 +184,7 @@ void CPUIDInfo::ArmLinuxInit() { has_arm_neon_i8mm_ = ((getauxval(AT_HWCAP2) & HWCAP2_I8MM) != 0); has_arm_sve_i8mm_ = ((getauxval(AT_HWCAP2) & HWCAP2_SVEI8MM) != 0); + has_arm_neon_bf16_ = ((getauxval(AT_HWCAP2) & HWCAP2_BF16) != 0); #endif } @@ -233,51 +241,24 @@ void CPUIDInfo::ArmWindowsInit() { lastUarch = uarch; } } - - switch (lastUarch) { - case cpuinfo_uarch_cortex_a55: - case cpuinfo_uarch_cortex_a55r0: - case cpuinfo_uarch_cortex_a76: - case cpuinfo_uarch_neoverse_n1: - case cpuinfo_uarch_cortex_a77: - case cpuinfo_uarch_exynos_m4: - case cpuinfo_uarch_exynos_m5: - has_fp16_ = true; - break; - default: - break; - } - if (!has_fp16_) { - /* - * Detecting fp16 support. Different cores should have the same instruction set. - * So we just check the first ID_AA64PFR0_EL1 - * Op0(0b11), Op1(0b000), CRn(0b0000), CRm(0b0100), Op2(0b000), - */ - uint64_t ID_AA64PFR0_EL1; - unsigned long valsize = sizeof(uint64_t); - auto retCode = ::RegGetValueA( - HKEY_LOCAL_MACHINE, - "HARDWARE\\DESCRIPTION\\System\\CentralProcessor\\0", - "CP 4020", RRF_RT_REG_QWORD, nullptr, - &ID_AA64PFR0_EL1, &valsize); - if (retCode == ERROR_SUCCESS) { - // AdvSIMD, bits [23:20] - auto advSimd = ID_AA64PFR0_EL1 >> 20; - if ((advSimd & 0xfULL) == 1) { - has_fp16_ = true; - } - } - } #endif /* Application Family or OneCore Family */ has_arm_neon_dot_ = (IsProcessorFeaturePresent(PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE) != 0); #else has_arm_neon_dot_ = false; #endif - has_fp16_ |= has_arm_neon_dot_; - /* TODO: implement them when hw+sw is available for testing these features */ - has_arm_neon_i8mm_ = false; - has_arm_sve_i8mm_ = false; + + if (pytorch_cpuinfo_init_) { + has_fp16_ = cpuinfo_has_arm_neon_fp16_arith(); + has_arm_neon_i8mm_ = cpuinfo_has_arm_i8mm(); + has_arm_sve_i8mm_ = cpuinfo_has_arm_sve() && cpuinfo_has_arm_i8mm(); + has_arm_neon_bf16_ = cpuinfo_has_arm_neon_bf16(); + } else { + has_fp16_ = false; + has_arm_neon_i8mm_ = false; + has_arm_sve_i8mm_ = false; + has_arm_neon_bf16_ = false; + } } #endif /* (arm or arm64) and windows */ @@ -297,5 +278,21 @@ uint32_t CPUIDInfo::GetCurrentCoreIdx() const { return 0xFFFFFFFF; // don't know how to get core index #endif } - +CPUIDInfo::CPUIDInfo() { +#ifdef CPUIDINFO_ARCH_X86 + X86Init(); +#elif defined(CPUIDINFO_ARCH_ARM) +#if CPUINFO_SUPPORTED + pytorch_cpuinfo_init_ = cpuinfo_initialize(); + if (!pytorch_cpuinfo_init_) { + LOGS_DEFAULT(WARNING) << "Failed to init pytorch cpuinfo library, may cause CPU EP performance degradation due to undetected CPU features."; + } +#endif +#ifdef __linux__ + ArmLinuxInit(); +#elif defined(_WIN32) + ArmWindowsInit(); +#endif /* (arm or arm64) and windows */ +#endif +} } // namespace onnxruntime diff --git 
a/onnxruntime/core/common/cpuid_info.h b/onnxruntime/core/common/cpuid_info.h index a15c75104b83..a3936b4bd11a 100644 --- a/onnxruntime/core/common/cpuid_info.h +++ b/onnxruntime/core/common/cpuid_info.h @@ -30,6 +30,7 @@ class CPUIDInfo { bool HasArmNeonDot() const { return has_arm_neon_dot_; } bool HasArmNeon_I8MM() const { return has_arm_neon_i8mm_; } bool HasArmSVE_I8MM() const { return has_arm_sve_i8mm_; } + bool HasArmNeon_BF16() const { return has_arm_neon_bf16_; } uint32_t GetCurrentCoreIdx() const; @@ -92,17 +93,7 @@ class CPUIDInfo { } private: - CPUIDInfo() { -#ifdef CPUIDINFO_ARCH_X86 - X86Init(); -#elif defined(CPUIDINFO_ARCH_ARM) -#ifdef __linux__ - ArmLinuxInit(); -#elif defined(_WIN32) - ArmWindowsInit(); -#endif /* (arm or arm64) and windows */ -#endif - } + CPUIDInfo(); bool has_amx_bf16_{false}; bool has_avx_{false}; bool has_avx2_{false}; @@ -125,15 +116,18 @@ class CPUIDInfo { bool has_fp16_{false}; bool has_arm_neon_i8mm_{false}; bool has_arm_sve_i8mm_{false}; + bool has_arm_neon_bf16_{false}; #ifdef CPUIDINFO_ARCH_X86 void X86Init(); - #elif defined(CPUIDINFO_ARCH_ARM) + // Now the following var is only used in ARM build, but later one we may expand the usage. + bool pytorch_cpuinfo_init_{false}; +#endif + #ifdef __linux__ - bool pytorch_cpuinfo_init_{false}; void ArmLinuxInit(); #elif defined(_WIN32) @@ -141,7 +135,6 @@ class CPUIDInfo { void ArmWindowsInit(); #endif /* (arm or arm64) and windows */ -#endif }; } // namespace onnxruntime diff --git a/onnxruntime/core/common/flatbuffers.h b/onnxruntime/core/common/flatbuffers.h new file mode 100644 index 000000000000..0d61e1038a82 --- /dev/null +++ b/onnxruntime/core/common/flatbuffers.h @@ -0,0 +1,18 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
+ +#pragma once +#if defined(__GNUC__) +#include "onnxruntime_config.h" +#pragma GCC diagnostic push + +#ifdef HAS_SHORTEN_64_TO_32 +#pragma GCC diagnostic ignored "-Wshorten-64-to-32" +#endif +#endif + +#include "flatbuffers/flatbuffers.h" + +#if defined(__GNUC__) +#pragma GCC diagnostic pop +#endif \ No newline at end of file diff --git a/onnxruntime/core/common/logging/logging.cc b/onnxruntime/core/common/logging/logging.cc index 6c6e2f48557e..eac9a7fa0808 100644 --- a/onnxruntime/core/common/logging/logging.cc +++ b/onnxruntime/core/common/logging/logging.cc @@ -12,6 +12,8 @@ #ifdef _WIN32 #include +#include "core/platform/windows/logging/etw_sink.h" +#include "core/common/logging/sinks/composite_sink.h" #else #include #if defined(__MACH__) || defined(__wasm__) || defined(_AIX) @@ -243,5 +245,36 @@ unsigned int GetProcessId() { #endif } +std::unique_ptr EnhanceLoggerWithEtw(std::unique_ptr existingLogger, logging::Severity originalSeverity, + logging::Severity etwSeverity) { +#ifdef _WIN32 + auto& manager = EtwRegistrationManager::Instance(); + if (manager.IsEnabled()) { + auto compositeSink = std::make_unique(); + compositeSink->AddSink(std::move(existingLogger), originalSeverity); + compositeSink->AddSink(std::make_unique(), etwSeverity); + return compositeSink; + } else { + return existingLogger; + } +#else + // On non-Windows platforms, just return the existing logger + (void)originalSeverity; + (void)etwSeverity; + return existingLogger; +#endif // _WIN32 +} + +Severity OverrideLevelWithEtw(Severity originalSeverity) { +#ifdef _WIN32 + auto& manager = logging::EtwRegistrationManager::Instance(); + if (manager.IsEnabled() && + (manager.Keyword() & static_cast(onnxruntime::logging::ORTTraceLoggingKeyword::Logs)) != 0) { + return manager.MapLevelToSeverity(); + } +#endif // _WIN32 + return originalSeverity; +} + } // namespace logging } // namespace onnxruntime diff --git a/onnxruntime/core/common/logging/sinks/composite_sink.h b/onnxruntime/core/common/logging/sinks/composite_sink.h index f27abb9e6aad..9d18eb527ffd 100644 --- a/onnxruntime/core/common/logging/sinks/composite_sink.h +++ b/onnxruntime/core/common/logging/sinks/composite_sink.h @@ -5,6 +5,8 @@ #include #include +#include +#include #include "core/common/logging/isink.h" #include "core/common/logging/logging.h" @@ -27,20 +29,31 @@ class CompositeSink : public ISink { /// Adds a sink. Takes ownership of the sink (so pass unique_ptr by value). ///
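The new core/common/flatbuffers.h above is a wrapper header: it suppresses a known warning only around the third-party include, so every consumer includes the wrapper instead of flatbuffers/flatbuffers.h directly and the warning stays enabled for the rest of the codebase. A generic sketch of the same pattern; the header name and warning flag are placeholders, not ORT specifics:

```cpp
// third_party_wrapper.h -- silence one known-noisy warning only around the
// third-party include; push/pop keeps the setting scoped to this header.
#pragma once

#if defined(__GNUC__)
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wconversion"
#endif

#include "noisy/header.h"  // placeholder third-party header

#if defined(__GNUC__)
#pragma GCC diagnostic pop
#endif
```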
/// The sink. + /// The min severity to send a message to that sink /// This instance to allow chaining. - CompositeSink& AddSink(std::unique_ptr sink) { - sinks_.push_back(std::move(sink)); + CompositeSink& AddSink(std::unique_ptr sink, logging::Severity severity) { + sinks_with_severity_.emplace_back(std::move(sink), severity); return *this; } + /// + /// Gets a const reference to the collection of sinks and min severity for that sink + /// + /// A const reference to the vector pair of unique_ptr to ISink and severity. + const std::vector, logging::Severity>>& GetSinks() const { + return sinks_with_severity_; + } + private: void SendImpl(const Timestamp& timestamp, const std::string& logger_id, const Capture& message) override { - for (auto& sink : sinks_) { - sink->Send(timestamp, logger_id, message); + for (auto& sink_pair : sinks_with_severity_) { + if (message.Severity() >= sink_pair.second) { + sink_pair.first->Send(timestamp, logger_id, message); + } } } - std::vector> sinks_; + std::vector, logging::Severity>> sinks_with_severity_; }; } // namespace logging } // namespace onnxruntime diff --git a/onnxruntime/core/common/logging/sinks/ostream_sink.cc b/onnxruntime/core/common/logging/sinks/ostream_sink.cc index 0db3d8709d48..a120138d1d15 100644 --- a/onnxruntime/core/common/logging/sinks/ostream_sink.cc +++ b/onnxruntime/core/common/logging/sinks/ostream_sink.cc @@ -2,7 +2,6 @@ // Licensed under the MIT License. #include "core/common/logging/sinks/ostream_sink.h" -#include "date/date.h" namespace onnxruntime { namespace logging { @@ -24,7 +23,7 @@ struct Color { void OStreamSink::SendImpl(const Timestamp& timestamp, const std::string& logger_id, const Capture& message) { // operator for formatting of timestamp in ISO8601 format including microseconds - using date::operator<<; + using timestamp_ns::operator<<; // Two options as there may be multiple calls attempting to write to the same sink at once: // 1) Use mutex to synchronize access to the stream. diff --git a/onnxruntime/core/common/string_utils.h b/onnxruntime/core/common/string_utils.h index eca1221e84cb..716eed1afec5 100644 --- a/onnxruntime/core/common/string_utils.h +++ b/onnxruntime/core/common/string_utils.h @@ -65,5 +65,24 @@ inline std::string TrimString(std::string s) { return s; } +/** + * @brief A consistent way to construct the full qualified op name. + */ +inline std::string GetFullQualifiedOpName(const std::string& op_type, const std::string& domain) { + return MakeString(domain, "::", op_type); +} + +/** + * Use this simple hash to generate unique int by given string input. 
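The reworked CompositeSink above pairs each sink with a minimum severity and filters in SendImpl, which is what lets EnhanceLoggerWithEtw route the same message stream to two sinks at different verbosity levels. A self-contained sketch of that routing; Severity and ISink here are simplified stand-ins, not the ORT classes:

```cpp
#include <iostream>
#include <memory>
#include <string>
#include <utility>
#include <vector>

enum class Severity { kVERBOSE, kINFO, kWARNING, kERROR };

struct ISink {
  virtual ~ISink() = default;
  virtual void Send(Severity sev, const std::string& msg) = 0;
};

struct CerrSink : ISink {
  void Send(Severity, const std::string& msg) override { std::cerr << msg << "\n"; }
};

class CompositeSink : public ISink {
 public:
  // Each sink carries its own severity floor.
  CompositeSink& AddSink(std::unique_ptr<ISink> sink, Severity min_sev) {
    sinks_.emplace_back(std::move(sink), min_sev);
    return *this;
  }
  void Send(Severity sev, const std::string& msg) override {
    for (auto& [sink, min_sev] : sinks_)
      if (sev >= min_sev) sink->Send(sev, msg);  // route only at/above the floor
  }

 private:
  std::vector<std::pair<std::unique_ptr<ISink>, Severity>> sinks_;
};

int main() {
  CompositeSink composite;
  composite.AddSink(std::make_unique<CerrSink>(), Severity::kWARNING);
  composite.Send(Severity::kINFO, "dropped");     // below this sink's floor
  composite.Send(Severity::kERROR, "delivered");  // passes the filter
}
```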
+ */ +inline uint32_t GetHashFromString(const std::string& str_value) { + uint32_t hash = 0; + for (char const& c : str_value) { + hash = hash * 101 + c; + } + + return hash; +} + } // namespace utils } // namespace onnxruntime diff --git a/onnxruntime/core/common/utf8_util.h b/onnxruntime/core/common/utf8_util.h index 218309f7198d..583aaf0a47cf 100644 --- a/onnxruntime/core/common/utf8_util.h +++ b/onnxruntime/core/common/utf8_util.h @@ -8,8 +8,13 @@ namespace onnxruntime { namespace utf8_util { -// Returns the number of bytes in the utf8 character -// by analyzing its leading byte +/// +/// Checks the extension bytes and returns a number of +/// bytes in the UTF-8 character +/// +/// +/// result +/// false if the char len is greater than 4 otherwise true inline bool utf8_bytes(unsigned char ch, size_t& len) { if ((ch & 0x80) == 0) { len = 1; diff --git a/onnxruntime/core/flatbuffers/checkpoint_version.h b/onnxruntime/core/flatbuffers/checkpoint_version.h index 6cad27c35024..e6ee20bf508c 100644 --- a/onnxruntime/core/flatbuffers/checkpoint_version.h +++ b/onnxruntime/core/flatbuffers/checkpoint_version.h @@ -13,7 +13,9 @@ namespace onnxruntime { // The format includes support for the ModuleState (stores the module parameters), OptimizerGroups // (stores the optimizer states), and PropertyBag // (stores custom user properties with support for int64, float and strings). -constexpr const int kCheckpointVersion = 1; +// Version 2: Introduces the On-Device Training nominal checkpoint state. +// Changes include the addition of the is_nominal_state field in the checkpoint's ModuleState. +constexpr const int kCheckpointVersion = 2; /** * @brief Check if the given checkpoint version is supported in this build diff --git a/onnxruntime/core/flatbuffers/flatbuffers_utils.h b/onnxruntime/core/flatbuffers/flatbuffers_utils.h index 55bde0b2df80..76860d6ab1db 100644 --- a/onnxruntime/core/flatbuffers/flatbuffers_utils.h +++ b/onnxruntime/core/flatbuffers/flatbuffers_utils.h @@ -5,7 +5,7 @@ #include -#include "flatbuffers/flatbuffers.h" +#include "core/common/flatbuffers.h" #include "core/common/common.h" #include "core/common/path_string.h" diff --git a/onnxruntime/core/flatbuffers/ort_flatbuffers_py/fbs/ModuleState.py b/onnxruntime/core/flatbuffers/ort_flatbuffers_py/fbs/ModuleState.py index 2be826fee2cc..19c6b1b6f275 100644 --- a/onnxruntime/core/flatbuffers/ort_flatbuffers_py/fbs/ModuleState.py +++ b/onnxruntime/core/flatbuffers/ort_flatbuffers_py/fbs/ModuleState.py @@ -74,9 +74,17 @@ def FrozenParamsIsNone(self): o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(6)) return o == 0 -def ModuleStateStart(builder): builder.StartObject(2) + # ModuleState + def IsNominalState(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(8)) + if o != 0: + return bool(self._tab.Get(flatbuffers.number_types.BoolFlags, o + self._tab.Pos)) + return False + +def ModuleStateStart(builder): builder.StartObject(3) def ModuleStateAddRequiresGradParams(builder, requiresGradParams): builder.PrependUOffsetTRelativeSlot(0, flatbuffers.number_types.UOffsetTFlags.py_type(requiresGradParams), 0) def ModuleStateStartRequiresGradParamsVector(builder, numElems): return builder.StartVector(4, numElems, 4) def ModuleStateAddFrozenParams(builder, frozenParams): builder.PrependUOffsetTRelativeSlot(1, flatbuffers.number_types.UOffsetTFlags.py_type(frozenParams), 0) def ModuleStateStartFrozenParamsVector(builder, numElems): return builder.StartVector(4, numElems, 4) +def 
ModuleStateAddIsNominalState(builder, isNominalState): builder.PrependBoolSlot(2, isNominalState, 0) def ModuleStateEnd(builder): return builder.EndObject() diff --git a/onnxruntime/core/flatbuffers/schema/README.md b/onnxruntime/core/flatbuffers/schema/README.md index 932478111ee6..96a2936c196a 100644 --- a/onnxruntime/core/flatbuffers/schema/README.md +++ b/onnxruntime/core/flatbuffers/schema/README.md @@ -21,7 +21,7 @@ e.g. - /build/Linux/Debug/_deps/flatbuffers-build/flatc It is possible to use another flatc as well, e.g., from a separate installation. Note that ONNX Runtime uses -FlatBuffers 1.12. +FlatBuffers 23.5.26. To update the flatbuffers schemas and generated files: 1. Modify [the ORT file format schema](ort.fbs) or [training checkpoint schema](ort_training_checkpoint.fbs). diff --git a/onnxruntime/core/flatbuffers/schema/ort.fbs.h b/onnxruntime/core/flatbuffers/schema/ort.fbs.h index e0f5342c2962..dc8a471f2d81 100644 --- a/onnxruntime/core/flatbuffers/schema/ort.fbs.h +++ b/onnxruntime/core/flatbuffers/schema/ort.fbs.h @@ -4,7 +4,7 @@ #ifndef FLATBUFFERS_GENERATED_ORT_ONNXRUNTIME_FBS_H_ #define FLATBUFFERS_GENERATED_ORT_ONNXRUNTIME_FBS_H_ -#include "flatbuffers/flatbuffers.h" +#include "core/common/flatbuffers.h" namespace onnxruntime { namespace fbs { @@ -562,8 +562,8 @@ struct DimensionValue FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { } bool Verify(flatbuffers::Verifier &verifier) const { return VerifyTableStart(verifier) && - VerifyField(verifier, VT_DIM_TYPE) && - VerifyField(verifier, VT_DIM_VALUE) && + VerifyField(verifier, VT_DIM_TYPE, 1) && + VerifyField(verifier, VT_DIM_VALUE, 8) && VerifyOffset(verifier, VT_DIM_PARAM) && verifier.VerifyString(dim_param()) && verifier.EndTable(); @@ -634,7 +634,7 @@ struct TensorTypeAndShape FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { } bool Verify(flatbuffers::Verifier &verifier) const { return VerifyTableStart(verifier) && - VerifyField(verifier, VT_ELEM_TYPE) && + VerifyField(verifier, VT_ELEM_TYPE, 4) && VerifyOffset(verifier, VT_SHAPE) && verifier.VerifyTable(shape()) && verifier.EndTable(); @@ -687,7 +687,7 @@ struct MapType FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { } bool Verify(flatbuffers::Verifier &verifier) const { return VerifyTableStart(verifier) && - VerifyField(verifier, VT_KEY_TYPE) && + VerifyField(verifier, VT_KEY_TYPE, 4) && VerifyOffset(verifier, VT_VALUE_TYPE) && verifier.VerifyTable(value_type()) && verifier.EndTable(); @@ -787,7 +787,7 @@ struct NodeEdge FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { } bool Verify(flatbuffers::Verifier &verifier) const { return VerifyTableStart(verifier) && - VerifyField(verifier, VT_NODE_INDEX) && + VerifyField(verifier, VT_NODE_INDEX, 4) && VerifyOffset(verifier, VT_INPUT_EDGES) && verifier.VerifyVector(input_edges()) && VerifyOffset(verifier, VT_OUTPUT_EDGES) && @@ -911,11 +911,11 @@ struct Node FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { verifier.VerifyString(doc_string()) && VerifyOffset(verifier, VT_DOMAIN) && verifier.VerifyString(domain()) && - VerifyField(verifier, VT_SINCE_VERSION) && - VerifyField(verifier, VT_INDEX) && + VerifyField(verifier, VT_SINCE_VERSION, 4) && + VerifyField(verifier, VT_INDEX, 4) && VerifyOffset(verifier, VT_OP_TYPE) && verifier.VerifyString(op_type()) && - VerifyField(verifier, VT_TYPE) && + VerifyField(verifier, VT_TYPE, 4) && VerifyOffset(verifier, VT_EXECUTION_PROVIDER_TYPE) && verifier.VerifyString(execution_provider_type()) && VerifyOffset(verifier, VT_INPUTS) && @@ -1174,7 +1174,7 @@ 
struct TypeInfo FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { return VerifyTableStart(verifier) && VerifyOffset(verifier, VT_DENOTATION) && verifier.VerifyString(denotation()) && - VerifyField(verifier, VT_VALUE_TYPE) && + VerifyField(verifier, VT_VALUE_TYPE, 1) && VerifyOffset(verifier, VT_VALUE) && VerifyTypeInfoValue(verifier, value(), value_type()) && verifier.EndTable(); @@ -1259,7 +1259,7 @@ struct OperatorSetId FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { return VerifyTableStart(verifier) && VerifyOffset(verifier, VT_DOMAIN) && verifier.VerifyString(domain()) && - VerifyField(verifier, VT_VERSION) && + VerifyField(verifier, VT_VERSION, 8) && verifier.EndTable(); } }; @@ -1343,7 +1343,7 @@ struct Tensor FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { verifier.VerifyString(doc_string()) && VerifyOffset(verifier, VT_DIMS) && verifier.VerifyVector(dims()) && - VerifyField(verifier, VT_DATA_TYPE) && + VerifyField(verifier, VT_DATA_TYPE, 4) && VerifyOffset(verifier, VT_RAW_DATA) && verifier.VerifyVector(raw_data()) && VerifyOffset(verifier, VT_STRING_DATA) && @@ -1568,9 +1568,9 @@ struct Attribute FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { verifier.VerifyString(name()) && VerifyOffset(verifier, VT_DOC_STRING) && verifier.VerifyString(doc_string()) && - VerifyField(verifier, VT_TYPE) && - VerifyField(verifier, VT_F) && - VerifyField(verifier, VT_I) && + VerifyField(verifier, VT_TYPE, 4) && + VerifyField(verifier, VT_F, 4) && + VerifyField(verifier, VT_I, 8) && VerifyOffset(verifier, VT_S) && verifier.VerifyString(s()) && VerifyOffset(verifier, VT_T) && @@ -1759,12 +1759,12 @@ struct NodesToOptimizeIndices FLATBUFFERS_FINAL_CLASS : private flatbuffers::Tab return VerifyTableStart(verifier) && VerifyOffset(verifier, VT_NODE_INDICES) && verifier.VerifyVector(node_indices()) && - VerifyField(verifier, VT_NUM_INPUTS) && - VerifyField(verifier, VT_NUM_OUTPUTS) && - VerifyField(verifier, VT_HAS_VARIADIC_INPUT) && - VerifyField(verifier, VT_HAS_VARIADIC_OUTPUT) && - VerifyField(verifier, VT_NUM_VARIADIC_INPUTS) && - VerifyField(verifier, VT_NUM_VARIADIC_OUTPUTS) && + VerifyField(verifier, VT_NUM_INPUTS, 4) && + VerifyField(verifier, VT_NUM_OUTPUTS, 4) && + VerifyField(verifier, VT_HAS_VARIADIC_INPUT, 1) && + VerifyField(verifier, VT_HAS_VARIADIC_OUTPUT, 1) && + VerifyField(verifier, VT_NUM_VARIADIC_INPUTS, 4) && + VerifyField(verifier, VT_NUM_VARIADIC_OUTPUTS, 4) && verifier.EndTable(); } }; @@ -1862,8 +1862,8 @@ struct DeprecatedNodeIndexAndKernelDefHash FLATBUFFERS_FINAL_CLASS : private fla } bool Verify(flatbuffers::Verifier &verifier) const { return VerifyTableStart(verifier) && - VerifyField(verifier, VT_NODE_INDEX) && - VerifyField(verifier, VT_KERNEL_DEF_HASH) && + VerifyField(verifier, VT_NODE_INDEX, 4) && + VerifyField(verifier, VT_KERNEL_DEF_HASH, 8) && verifier.EndTable(); } }; @@ -2161,7 +2161,7 @@ struct Graph FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { VerifyOffset(verifier, VT_NODES) && verifier.VerifyVector(nodes()) && verifier.VerifyVectorOfTables(nodes()) && - VerifyField(verifier, VT_MAX_NODE_INDEX) && + VerifyField(verifier, VT_MAX_NODE_INDEX, 4) && VerifyOffset(verifier, VT_NODE_EDGES) && verifier.VerifyVector(node_edges()) && verifier.VerifyVectorOfTables(node_edges()) && @@ -2390,7 +2390,7 @@ struct Model FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { } bool Verify(flatbuffers::Verifier &verifier) const { return VerifyTableStart(verifier) && - VerifyField(verifier, VT_IR_VERSION) && + VerifyField(verifier, 
VT_IR_VERSION, 8) && VerifyOffset(verifier, VT_OPSET_IMPORT) && verifier.VerifyVector(opset_import()) && verifier.VerifyVectorOfTables(opset_import()) && @@ -2400,7 +2400,7 @@ struct Model FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { verifier.VerifyString(producer_version()) && VerifyOffset(verifier, VT_DOMAIN) && verifier.VerifyString(domain()) && - VerifyField(verifier, VT_MODEL_VERSION) && + VerifyField(verifier, VT_MODEL_VERSION, 8) && VerifyOffset(verifier, VT_DOC_STRING) && verifier.VerifyString(doc_string()) && VerifyOffset(verifier, VT_GRAPH) && @@ -2740,8 +2740,8 @@ struct ArgTypeAndIndex FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { } bool Verify(flatbuffers::Verifier &verifier) const { return VerifyTableStart(verifier) && - VerifyField(verifier, VT_ARG_TYPE) && - VerifyField(verifier, VT_INDEX) && + VerifyField(verifier, VT_ARG_TYPE, 1) && + VerifyField(verifier, VT_INDEX, 4) && verifier.EndTable(); } }; diff --git a/onnxruntime/core/flatbuffers/schema/ort_training_checkpoint.fbs b/onnxruntime/core/flatbuffers/schema/ort_training_checkpoint.fbs index c8244b0a426f..94757fa6d5bf 100644 --- a/onnxruntime/core/flatbuffers/schema/ort_training_checkpoint.fbs +++ b/onnxruntime/core/flatbuffers/schema/ort_training_checkpoint.fbs @@ -8,6 +8,10 @@ namespace onnxruntime.fbs; table ModuleState { requires_grad_params:[Tensor]; frozen_params:[Tensor]; + // Nominal state just means that the Tensors in the ModuleState + // are empty. i.e. The tensors are treated as named entities + // without any meaningful data. + is_nominal_state:bool; } table ParameterOptimizerState { diff --git a/onnxruntime/core/flatbuffers/schema/ort_training_checkpoint.fbs.h b/onnxruntime/core/flatbuffers/schema/ort_training_checkpoint.fbs.h index 48feebb19769..62e6cf74394e 100644 --- a/onnxruntime/core/flatbuffers/schema/ort_training_checkpoint.fbs.h +++ b/onnxruntime/core/flatbuffers/schema/ort_training_checkpoint.fbs.h @@ -4,7 +4,7 @@ #ifndef FLATBUFFERS_GENERATED_ORTTRAININGCHECKPOINT_ONNXRUNTIME_FBS_H_ #define FLATBUFFERS_GENERATED_ORTTRAININGCHECKPOINT_ONNXRUNTIME_FBS_H_ -#include "flatbuffers/flatbuffers.h" +#include "core/common/flatbuffers.h" #include "ort.fbs.h" @@ -39,7 +39,8 @@ struct ModuleState FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { typedef ModuleStateBuilder Builder; enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { VT_REQUIRES_GRAD_PARAMS = 4, - VT_FROZEN_PARAMS = 6 + VT_FROZEN_PARAMS = 6, + VT_IS_NOMINAL_STATE = 8 }; const flatbuffers::Vector> *requires_grad_params() const { return GetPointer> *>(VT_REQUIRES_GRAD_PARAMS); @@ -47,6 +48,9 @@ struct ModuleState FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { const flatbuffers::Vector> *frozen_params() const { return GetPointer> *>(VT_FROZEN_PARAMS); } + bool is_nominal_state() const { + return GetField(VT_IS_NOMINAL_STATE, 0) != 0; + } bool Verify(flatbuffers::Verifier &verifier) const { return VerifyTableStart(verifier) && VerifyOffset(verifier, VT_REQUIRES_GRAD_PARAMS) && @@ -55,6 +59,7 @@ struct ModuleState FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { VerifyOffset(verifier, VT_FROZEN_PARAMS) && verifier.VerifyVector(frozen_params()) && verifier.VerifyVectorOfTables(frozen_params()) && + VerifyField(verifier, VT_IS_NOMINAL_STATE, 1) && verifier.EndTable(); } }; @@ -69,6 +74,9 @@ struct ModuleStateBuilder { void add_frozen_params(flatbuffers::Offset>> frozen_params) { fbb_.AddOffset(ModuleState::VT_FROZEN_PARAMS, frozen_params); } + void add_is_nominal_state(bool is_nominal_state) { + 
fbb_.AddElement(ModuleState::VT_IS_NOMINAL_STATE, static_cast(is_nominal_state), 0); + } explicit ModuleStateBuilder(flatbuffers::FlatBufferBuilder &_fbb) : fbb_(_fbb) { start_ = fbb_.StartTable(); @@ -84,23 +92,27 @@ struct ModuleStateBuilder { inline flatbuffers::Offset CreateModuleState( flatbuffers::FlatBufferBuilder &_fbb, flatbuffers::Offset>> requires_grad_params = 0, - flatbuffers::Offset>> frozen_params = 0) { + flatbuffers::Offset>> frozen_params = 0, + bool is_nominal_state = false) { ModuleStateBuilder builder_(_fbb); builder_.add_frozen_params(frozen_params); builder_.add_requires_grad_params(requires_grad_params); + builder_.add_is_nominal_state(is_nominal_state); return builder_.Finish(); } inline flatbuffers::Offset CreateModuleStateDirect( flatbuffers::FlatBufferBuilder &_fbb, const std::vector> *requires_grad_params = nullptr, - const std::vector> *frozen_params = nullptr) { + const std::vector> *frozen_params = nullptr, + bool is_nominal_state = false) { auto requires_grad_params__ = requires_grad_params ? _fbb.CreateVector>(*requires_grad_params) : 0; auto frozen_params__ = frozen_params ? _fbb.CreateVector>(*frozen_params) : 0; return onnxruntime::fbs::CreateModuleState( _fbb, requires_grad_params__, - frozen_params__); + frozen_params__, + is_nominal_state); } struct ParameterOptimizerState FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { @@ -194,8 +206,8 @@ struct OptimizerGroup FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { return VerifyTableStart(verifier) && VerifyOffset(verifier, VT_GROUP_NAME) && verifier.VerifyString(group_name()) && - VerifyField(verifier, VT_STEP) && - VerifyField(verifier, VT_INITIAL_LEARNING_RATE) && + VerifyField(verifier, VT_STEP, 8) && + VerifyField(verifier, VT_INITIAL_LEARNING_RATE, 4) && VerifyOffset(verifier, VT_OPTIMIZER_STATES) && verifier.VerifyVector(optimizer_states()) && verifier.VerifyVectorOfTables(optimizer_states()) && @@ -277,7 +289,7 @@ struct IntProperty FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { return VerifyTableStart(verifier) && VerifyOffset(verifier, VT_NAME) && verifier.VerifyString(name()) && - VerifyField(verifier, VT_VALUE) && + VerifyField(verifier, VT_VALUE, 8) && verifier.EndTable(); } }; @@ -341,7 +353,7 @@ struct FloatProperty FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { return VerifyTableStart(verifier) && VerifyOffset(verifier, VT_NAME) && verifier.VerifyString(name()) && - VerifyField(verifier, VT_VALUE) && + VerifyField(verifier, VT_VALUE, 4) && verifier.EndTable(); } }; @@ -560,7 +572,7 @@ struct Checkpoint FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { } bool Verify(flatbuffers::Verifier &verifier) const { return VerifyTableStart(verifier) && - VerifyField(verifier, VT_VERSION) && + VerifyField(verifier, VT_VERSION, 4) && VerifyOffset(verifier, VT_MODULE_STATE) && verifier.VerifyTable(module_state()) && VerifyOffset(verifier, VT_OPTIMIZER_GROUPS) && diff --git a/onnxruntime/core/framework/allocation_planner.cc b/onnxruntime/core/framework/allocation_planner.cc index ea7a6432a750..95e5380675df 100644 --- a/onnxruntime/core/framework/allocation_planner.cc +++ b/onnxruntime/core/framework/allocation_planner.cc @@ -175,14 +175,12 @@ class PlannerImpl { size_t num_logic_streams_{0}; std::vector> stream_nodes_; - InlinedVector node_stream_map_; // dependence_graph_ keeps the dependencies combining model graph and logic streams // e.g. dependence_graph_[downstream_node] = [upstream_node_0, upstream_node_1, upstream_node_2 ...] 
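For reference, the updated generated builder API above (CreateModuleStateDirect with the new trailing is_nominal_state argument) would be driven roughly as follows. A hedged sketch, assuming the generated header and the FlatBuffers library are on the include path:

```cpp
#include "core/flatbuffers/schema/ort_training_checkpoint.fbs.h"

// Build a nominal ModuleState: parameter tensors are omitted and the flag
// marks them as named-but-empty entities, per the schema comment above.
flatbuffers::Offset<onnxruntime::fbs::ModuleState> BuildNominalModuleState(
    flatbuffers::FlatBufferBuilder& builder) {
  return onnxruntime::fbs::CreateModuleStateDirect(
      builder,
      /*requires_grad_params=*/nullptr,
      /*frozen_params=*/nullptr,
      /*is_nominal_state=*/true);
}
```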
// upstream_node_0 and upstream_node_1 are the immmediate upstream nodes of downstream_node // upstream_node_2 is the immediate nodes ahead of downstream_node in the same logic stream InlinedHashMap> dependence_graph_; - InlinedHashMap> value_consumer_map_; InlinedHashMap value_node_map_; // OrtValueInfo: Auxiliary information about an OrtValue used only during plan-generation: @@ -295,7 +293,7 @@ class PlannerImpl { } #endif - // Find if there exists some input tensor that we can use in-place for output_arg_num-th input in the node. + // Find if there exists some input tensor that we can use in-place for output_arg_num-th output in the node. bool FindReusableInput(const onnxruntime::Node& node, int output_arg_num, OrtValueIndex* reusable_input, bool* is_strided_tensor) { *is_strided_tensor = false; @@ -530,6 +528,7 @@ class PlannerImpl { // Initialize allocation plan: plan_.allocation_plan.resize(num_ml_values); + for (int i = 0; static_cast(i) < num_ml_values; i++) AllocPlan(i).reused_buffer = i; } bool HasExternalOutputs(const Node& node) const { @@ -1065,7 +1064,8 @@ class PlannerImpl { // build the consumer list for each value int num_ml_values = ort_value_name_idx_map_.MaxIdx() + 1; - value_consumer_map_.reserve(num_ml_values); + InlinedHashMap> value_consumer_map; + value_consumer_map.reserve(num_ml_values); // iterate each stream from back, so the first element is the last consumer in single stream case for (auto& stream : stream_nodes_) { @@ -1078,10 +1078,10 @@ class PlannerImpl { const auto& name = input.Name(); int value_idx; ORT_RETURN_IF_ERROR(ort_value_name_idx_map_.GetIdx(name, value_idx)); - auto origin = Buffer(value_idx); - if (origin != -1 && plan_.allocation_plan[origin].alloc_kind == AllocKind::kAllocate) { + auto origin = AllocPlan(value_idx).reused_buffer; + if (AllocPlan(origin).alloc_kind == AllocKind::kAllocate) { // add current node as consumer for origin buffer - value_consumer_map_[origin].insert(node_index); + value_consumer_map[origin].insert(node_index); } } return Status::OK(); @@ -1138,8 +1138,8 @@ class PlannerImpl { std::cout << p_input_arg->Name() << " reused by " << p_output_arg->Name() << " as input" << std::endl; allocation_plan[output_idx_global].alloc_kind = AllocKind::kReuse; allocation_plan[output_idx_global].reused_buffer = reusable_input; - value_consumer_map_[reusable_input].insert(value_consumer_map_[output_idx_global].begin(), - value_consumer_map_[output_idx_global].end()); + value_consumer_map[reusable_input].insert(value_consumer_map[output_idx_global].begin(), + value_consumer_map[output_idx_global].end()); reused.insert(reusable_input); found_reusable = true; break; @@ -1168,8 +1168,8 @@ class PlannerImpl { allocation_plan[reusable_input].alloc_kind == AllocKind::kAllocate) { allocation_plan[output_idx_global].alloc_kind = AllocKind::kReuse; allocation_plan[output_idx_global].reused_buffer = reusable_input; - value_consumer_map_[reusable_input].insert(value_consumer_map_[output_idx_global].begin(), - value_consumer_map_[output_idx_global].end()); + value_consumer_map[reusable_input].insert(value_consumer_map[output_idx_global].begin(), + value_consumer_map[output_idx_global].end()); reused.insert(reusable_input); continue; } // if @@ -1187,11 +1187,11 @@ class PlannerImpl { OrtValueIndex input_arg_index{}; if (value_map.GetIdx(p_input_arg->Name(), input_arg_index).IsOK() && allocation_plan[input_arg_index].alloc_kind == AllocKind::kAllocate) { - if (value_consumer_map_[input_arg_index].size() == 1 && SameSize(*p_input_arg, 
*p_output_arg)) { + if (value_consumer_map[input_arg_index].size() == 1 && SameSize(*p_input_arg, *p_output_arg)) { allocation_plan[output_idx_global].alloc_kind = AllocKind::kReuse; allocation_plan[output_idx_global].reused_buffer = input_arg_index; - value_consumer_map_[input_arg_index].insert(value_consumer_map_[output_idx_global].begin(), - value_consumer_map_[output_idx_global].end()); + value_consumer_map[input_arg_index].insert(value_consumer_map[output_idx_global].begin(), + value_consumer_map[output_idx_global].end()); reused.insert(input_arg_index); } } @@ -1266,7 +1266,7 @@ class PlannerImpl { } bool all_covered = true; - for (auto consumer : value_consumer_map_[output_idx_global]) { + for (auto consumer : value_consumer_map[output_idx_global]) { if (deps->find(consumer) == deps->end()) { all_covered = false; break; @@ -1277,9 +1277,9 @@ class PlannerImpl { allocation_plan[downstream_value].reused_buffer = output_idx_global; get_reused = true; // add new consumer for the value to be reused - value_consumer_map_[output_idx_global].insert(value_node_map_[downstream_value]); - value_consumer_map_[output_idx_global].insert(value_consumer_map_[downstream_value].begin(), - value_consumer_map_[downstream_value].end()); + value_consumer_map[output_idx_global].insert(value_node_map_[downstream_value]); + value_consumer_map[output_idx_global].insert(value_consumer_map[downstream_value].begin(), + value_consumer_map[downstream_value].end()); node_iter = size_iter->second.erase(node_iter); if (size_iter->second.empty()) { local_iter->second.erase(size_iter); @@ -1342,8 +1342,9 @@ class PlannerImpl { ort_value_usecount.reserve(ort_value_info_.size()); #endif for (size_t i = 0; i < stream_nodes_.size(); ++i) { - // compute use count first + // compute use count first. TODO(leca): call ComputeReuseCount() only once is enough! ORT_RETURN_IF_ERROR(ComputeReuseCount()); + for (int j = 0; static_cast(j) < ort_value_info_.size(); j++) Buffer(j) = j; #if !defined(ORT_MINIMAL_BUILD) && defined(ORT_MEMORY_PROFILE) if (i == 0) { for (auto ort_value_info : ort_value_info_) { @@ -1693,8 +1694,8 @@ class PlannerImpl { const auto& name = input.Name(); int value_idx; ORT_RETURN_IF_ERROR(ort_value_name_idx_map_.GetIdx(name, value_idx)); - auto origin = Buffer(value_idx); - if (origin != -1 && plan_.allocation_plan[origin].alloc_kind == AllocKind::kAllocate) { + auto origin = AllocPlan(value_idx).reused_buffer; + if (AllocPlan(origin).alloc_kind == AllocKind::kAllocate) { // add current node as consumer for origin buffer value_consumers[origin].push_back(node_index); } @@ -1721,9 +1722,9 @@ class PlannerImpl { // we actually can do better if all the consumers depends on the last consumer. 
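The planner changes above drop the member value_consumer_map_ in favor of a local map, and resolve a value's origin through AllocPlan(idx).reused_buffer, which is now initialized so every value starts by reusing itself. A simplified stand-in model of that bookkeeping (not the planner's real types), showing how a reuse chain resolves to the allocating origin:

```cpp
#include <cstdio>
#include <vector>

enum class AllocKind { kAllocate, kReuse };
struct AllocPlanEntry {
  AllocKind alloc_kind = AllocKind::kAllocate;
  int reused_buffer = 0;
};

// Follow kReuse links until reaching the entry that actually allocates.
int ResolveOrigin(const std::vector<AllocPlanEntry>& plan, int idx) {
  while (plan[idx].alloc_kind == AllocKind::kReuse) idx = plan[idx].reused_buffer;
  return idx;
}

int main() {
  std::vector<AllocPlanEntry> plan(3);
  for (int i = 0; i < 3; ++i) plan[i].reused_buffer = i;  // self-reuse default
  plan[2] = {AllocKind::kReuse, 0};                       // value 2 reuses buffer 0
  std::printf("origin of value 2: %d\n", ResolveOrigin(plan, 2));  // prints 0
}
```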
// will optimize it later bool is_all_consumer_same_stream = true; - auto stream_idx = node_stream_map_[value_consumers[i][0]]; + auto stream_idx = plan_.node_stream_map_[value_consumers[i][0]]; for (size_t j = 1; j < value_consumers[i].size(); ++j) { - if (node_stream_map_[value_consumers[i][j]] != stream_idx) { + if (plan_.node_stream_map_[value_consumers[i][j]] != stream_idx) { is_all_consumer_same_stream = false; break; } @@ -1748,10 +1749,10 @@ class PlannerImpl { const PathString& /*partition_config_file*/) { if (graph_viewer_.NumberOfNodes() > 0) { stream_nodes_.push_back({}); - node_stream_map_.resize(SafeInt(graph_viewer_.MaxNodeIndex()) + 1); + plan_.node_stream_map_.resize(SafeInt(graph_viewer_.MaxNodeIndex()) + 1); for (auto node_index : graph_viewer_.GetNodesInTopologicalOrder()) { stream_nodes_[0].push_back(node_index); - node_stream_map_[node_index] = 0; + plan_.node_stream_map_[node_index] = 0; } num_logic_streams_ = 1; } @@ -1773,7 +1774,12 @@ class PlannerImpl { execution_plan.emplace_back(std::make_unique(node_device_mem_location)); // 2. add steps to the execution plan for (auto node_index : stream_nodes_[0]) { +#if defined(ORT_MINIMAL_BUILD) execution_plan[0]->steps_.emplace_back(std::make_unique(node_index)); +#else + execution_plan[0]->steps_.emplace_back(std::make_unique(node_index, + graph_viewer_.GetNode(node_index)->Name())); +#endif } } else { // graph with no nodes. e.g. subgraph of If might return the input as-is or a constant value from an initializer @@ -1790,10 +1796,10 @@ class PlannerImpl { auto partitioner = IGraphPartitioner::CreateGraphPartitioner(logger, partition_config_file); auto status = partitioner->PartitionGraph(graph_viewer_, execution_providers, stream_nodes_, context_->GetExecutionOrder()); ORT_ENFORCE(status.IsOK(), status.ErrorMessage()); - node_stream_map_.resize(SafeInt(graph_viewer_.MaxNodeIndex()) + 1); + plan_.node_stream_map_.resize(SafeInt(graph_viewer_.MaxNodeIndex()) + 1); for (size_t i = 0; i < stream_nodes_.size(); ++i) { for (auto node_index : stream_nodes_[i]) { - node_stream_map_[node_index] = i; + plan_.node_stream_map_[node_index] = i; } } num_logic_streams_ = stream_nodes_.size(); @@ -1856,7 +1862,7 @@ class PlannerImpl { auto* node = graph_viewer_.GetNode(node_index); for (auto it = node->OutputNodesBegin(); it != node->OutputNodesEnd(); ++it) { // if the output node is not in the same stream, generate a trigger point - if (node_stream_map_[it->Index()] != i + if (plan_.node_stream_map_[it->Index()] != i #ifdef ENABLE_TRAINING // Do not insert Barrier/TriggerDownStream step if the producer and consumer are in different sides of yieldOp // As in this case producer will surely be ready before the consumer is running. @@ -1889,9 +1895,9 @@ class PlannerImpl { // 2. the consumer is in the same stream(non-cpu device), but it consumes a CPU tensor from an non-shape op. // for example, a resize cuda kernel consumer a tensor from MemCpyToHost cuda kernel on the same stream. 
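The "all consumers on one stream" test above decides whether the cheaper synchronization path applies. A tiny sketch of that check with stand-in types, not the planner's real structures:

```cpp
#include <cstdio>
#include <vector>

bool AllConsumersOnOneStream(const std::vector<size_t>& consumers,
                             const std::vector<size_t>& node_stream_map) {
  const size_t stream = node_stream_map[consumers[0]];
  for (size_t i = 1; i < consumers.size(); ++i) {
    if (node_stream_map[consumers[i]] != stream) return false;  // crosses streams
  }
  return true;
}

int main() {
  // node -> stream assignment: nodes 0 and 2 on stream 0, node 1 on stream 1
  const std::vector<size_t> node_stream_map = {0, 1, 0};
  std::printf("%d\n", AllConsumersOnOneStream({0, 2}, node_stream_map));  // 1
  std::printf("%d\n", AllConsumersOnOneStream({0, 1}, node_stream_map));  // 0
}
```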
// in this case, the FIFO can't guarantee the cpu tensor is ready when resize kernel is launching - OrtDevice::DeviceType output_arg_device = plan_.allocation_plan[output_arg_idx].location.Type(); + OrtDevice::DeviceType output_arg_device = AllocPlan(output_arg_idx).location.Type(); WaitNotificationFn wait_handle = stream_handle_registry.GetWaitHandle(stream_device, output_arg_device); - if ((node_stream_map_[it->Index()] != i || output_arg_device == OrtDevice::CPU) && wait_handle != nullptr) { + if ((plan_.node_stream_map_[it->Index()] != i || output_arg_device == OrtDevice::CPU) && wait_handle != nullptr) { if (node_to_notification.find(node_index) == node_to_notification.end()) { node_to_notification[node_index] = plan_.notification_owners.size(); plan_.notification_owners.push_back(i); @@ -1903,7 +1909,7 @@ class PlannerImpl { } // output->Exists } // for each output if (output_consumed_in_subgraph) { - const auto downstream = node_stream_map_[it->Index()]; + const auto downstream = plan_.node_stream_map_[it->Index()]; if (downstream != i) { auto downstream_device = execution_plan[downstream]->device_.Type(); WaitNotificationFn wait_handle = stream_handle_registry.GetWaitHandle(stream_device, downstream_device); @@ -1929,7 +1935,7 @@ class PlannerImpl { onnxruntime::ProviderType exec_provider_name = node->GetExecutionProviderType(); const IExecutionProvider* ep = execution_providers.Get(exec_provider_name); auto node_device_mem_location = ep->GetOrtDeviceByMemType(OrtMemType::OrtMemTypeDefault); - ORT_ENFORCE(execution_plan[node_stream_map_[node_index]]->device_.Type() == node_device_mem_location.Type()); + ORT_ENFORCE(execution_plan[plan_.node_stream_map_[node_index]]->device_.Type() == node_device_mem_location.Type()); } } @@ -1978,8 +1984,12 @@ class PlannerImpl { // add dependency for model graph dependence_graph_[it->Index()].insert(node_index); } - // push launch kernel command +// push launch kernel command +#if defined(ORT_MINIMAL_BUILD) execution_plan[i]->steps_.emplace_back(std::make_unique(node_index)); +#else + execution_plan[i]->steps_.emplace_back(std::make_unique(node_index, graph_viewer_.GetNode(node_index)->Name())); +#endif // check if any notification generated by this node, if yes, push a activate auto notification_it = node_to_notification.find(node_index); if (notification_it != node_to_notification.end()) { @@ -2003,7 +2013,7 @@ class PlannerImpl { if (!node_output->Exists()) continue; OrtValueIndex output_idx_global; ORT_THROW_IF_ERROR(ort_value_name_idx_map_.GetIdx(node_output->Name(), output_idx_global)); - plan_.value_to_stream_map[output_idx_global] = node_stream_map_[node_index]; + plan_.value_to_stream_map[output_idx_global] = plan_.node_stream_map_[node_index]; value_node_map_[output_idx_global] = node_index; } } @@ -2079,7 +2089,7 @@ class PlannerImpl { } // trigger downstream for (auto it = node->OutputNodesBegin(); it != node->OutputNodesEnd(); ++it) { - auto stream_idx = node_stream_map_[it->Index()]; + auto stream_idx = plan_.node_stream_map_[it->Index()]; if (stream_idx != i) { auto node_it = std::find(stream_nodes_[stream_idx].begin(), stream_nodes_[stream_idx].end(), it->Index()); int offset = static_cast(std::distance(stream_nodes_[stream_idx].begin(), node_it)); diff --git a/onnxruntime/core/framework/bfc_arena.h b/onnxruntime/core/framework/bfc_arena.h index e16b90ded338..5e4cd9f62f11 100644 --- a/onnxruntime/core/framework/bfc_arena.h +++ b/onnxruntime/core/framework/bfc_arena.h @@ -482,7 +482,7 @@ class BFCArena : public IAllocator { Bin* 
BinForSize(size_t bytes) { return BinFromIndex(BinNumForSize(bytes)); } - char bins_space_[sizeof(Bin) * kNumBins]; + alignas(Bin) char bins_space_[sizeof(Bin) * kNumBins]; // The size of the current region allocation. SafeInt curr_region_allocation_bytes_; diff --git a/onnxruntime/core/framework/execution_frame.cc b/onnxruntime/core/framework/execution_frame.cc index d9c49dc6bea1..32a5f749af08 100644 --- a/onnxruntime/core/framework/execution_frame.cc +++ b/onnxruntime/core/framework/execution_frame.cc @@ -204,6 +204,14 @@ AllocatorPtr IExecutionFrame::GetAllocator(const OrtDevice& info) const { Status IExecutionFrame::ReleaseMLValue(int ort_value_idx) { return ReleaseMLValueImpl(ort_value_idx); } +#ifdef ENABLE_TRAINING +void IExecutionFrame::ReleaseAllMLValues() { + for (size_t ort_value_idx = 0; ort_value_idx < all_values_.size(); ort_value_idx++) { + all_values_[ort_value_idx] = OrtValue(); + } +} +#endif + Status IExecutionFrame::ReleaseMLValueImpl(int ort_value_idx) { if (ort_value_idx == NodeIndexInfo::kInvalidEntry || static_cast(ort_value_idx) >= all_values_size_) { return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "invalid index ", ort_value_idx); @@ -223,7 +231,8 @@ void IExecutionFrame::Init(gsl::span feed_mlvalue_idxs, gsl::span& initializers, const std::function& is_initializer_sparse_func, gsl::span fetches) { - ORT_ENFORCE(feeds.size() == feed_mlvalue_idxs.size()); + ORT_ENFORCE(feeds.size() == feed_mlvalue_idxs.size(), "Got feed size: ", feeds.size(), " but expected feed size: ", + feed_mlvalue_idxs.size()); ORT_ENFORCE(fetches.empty() || fetches.size() == fetch_mlvalue_idxs_.size()); // Need this for sparse conversions in host memory @@ -830,7 +839,20 @@ AllocatorPtr ExecutionFrame::GetAllocatorImpl(const OrtDevice& info) const { // This method is not thread safe! // Return S_OK and nullptr if index maps to a value that is an unused optional input/output Status ExecutionFrame::CreateNodeOutputMLValueImpl(OrtValue& ort_value, int ort_value_idx, const TensorShape* shape) { +#ifdef ENABLE_TRAINING + try { + auto status = AllocateAsPerAllocationPlan(ort_value, ort_value_idx, shape); + return status; + } catch (const std::exception& e) { + LOGS(session_state_.Logger(), WARNING) + << "Exception caught when allocating memory for ort_value with index: " << ort_value_idx + << ", so clean up all OrtValues"; + ReleaseAllMLValues(); + return Status(ONNXRUNTIME, FAIL, e.what()); + } +#else return AllocateAsPerAllocationPlan(ort_value, ort_value_idx, shape); +#endif } void ExecutionFrame::VerifyOutputSizes(int output_index, const Node& node, const TensorShape& output_shape) { diff --git a/onnxruntime/core/framework/execution_frame.h b/onnxruntime/core/framework/execution_frame.h index 1576c16684fa..18d210ffd48f 100644 --- a/onnxruntime/core/framework/execution_frame.h +++ b/onnxruntime/core/framework/execution_frame.h @@ -67,6 +67,8 @@ class IExecutionFrame { const std::unordered_map& initializers); Status GetOutputs(gsl::span fetch_mlvalue_idxs, std::vector& fetches); + // if OOM happens, release all values so the session can run the next batch.
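The one-character-looking bfc_arena.h change above (`alignas(Bin)`) is a real fix: a raw char array only guarantees `alignof(char)`, and placement-new into under-aligned storage is undefined behavior, so the backing buffer must carry the object's alignment. A minimal sketch with a stand-in Bin type:

```cpp
#include <cstddef>
#include <new>

struct Bin {
  alignas(16) double data[2];  // some over-aligned payload
};

constexpr size_t kNumBins = 4;
// alignas(Bin) makes the raw storage suitable for in-place construction.
alignas(Bin) char bins_space[sizeof(Bin) * kNumBins];

int main() {
  // Construct bins in-place, as BFCArena does with its bins_space_.
  for (size_t i = 0; i < kNumBins; ++i) {
    new (bins_space + i * sizeof(Bin)) Bin();
  }
  return 0;
}
```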
+ void ReleaseAllMLValues(); #endif // TO DO: make it thread safe diff --git a/onnxruntime/core/framework/execution_provider.cc b/onnxruntime/core/framework/execution_provider.cc index 7f8009216ce3..b39924d4c3ff 100644 --- a/onnxruntime/core/framework/execution_provider.cc +++ b/onnxruntime/core/framework/execution_provider.cc @@ -35,77 +35,4 @@ common::Status IExecutionProvider::Compile(const std::vector& } #endif - -int IExecutionProvider::ModelMetadefIdGenerator::GenerateId(const onnxruntime::GraphViewer& graph_viewer, - HashValue& model_hash) { - model_hash = 0; - - // find the top level graph - const Graph* cur_graph = &graph_viewer.GetGraph(); - while (cur_graph->IsSubgraph()) { - cur_graph = cur_graph->ParentGraph(); - } - - uint32_t instance_hash[4] = {0, 0, 0, 0}; - - const Graph& main_graph = *cur_graph; - - // hash the bytes in the Graph instance. we can't just use the address as a new Graph instance may use - // the same memory (unit tests prove this can occur). the raw bytes of the Graph instance should be a unique - // fingerprint for the instance that can use used as the key to the hash of the model path/contents. - MurmurHash3::x86_128(&main_graph, gsl::narrow_cast(sizeof(Graph)), instance_hash[0], &instance_hash); - HashValue graph_instance_hash = instance_hash[0] | (uint64_t(instance_hash[1]) << 32); - - // if we've already hashed this main graph instance use the cached value - auto entry = main_graph_hash_.find(graph_instance_hash); - if (entry != main_graph_hash_.cend()) { - model_hash = entry->second; - } else { - uint32_t hash[4] = {0, 0, 0, 0}; - - // prefer path the model was loaded from - // this may not be available if the model was loaded from a stream or in-memory bytes - const auto& model_path_str = main_graph.ModelPath().ToPathString(); - if (!model_path_str.empty()) { - MurmurHash3::x86_128(model_path_str.data(), gsl::narrow_cast(model_path_str.size()), hash[0], &hash); - } else { - auto hash_str = [&hash](const std::string& str) { - MurmurHash3::x86_128(str.data(), gsl::narrow_cast(str.size()), hash[0], &hash); - }; - - // fingerprint the main graph by hashing graph inputs and the ordered outputs from each node - for (const auto* node_arg : main_graph.GetInputsIncludingInitializers()) { - hash_str(node_arg->Name()); - } - - // note: process nodes in order defined in model to be deterministic - for (const auto& node : main_graph.Nodes()) { - for (const auto* node_arg : node.OutputDefs()) { - if (node_arg->Exists()) { - hash_str(node_arg->Name()); - } - } - } - } - - model_hash = hash[0] | (uint64_t(hash[1]) << 32); - - main_graph_hash_[graph_instance_hash] = model_hash; - } - - // return the current unique id, and increment to update - return model_metadef_id_[model_hash]++; -} - -int IExecutionProvider::GenerateMetaDefId(const onnxruntime::GraphViewer& graph_viewer, HashValue& model_hash) const { - ORT_ENFORCE(metadef_id_generator_, - "IExecutionProvider constructor must be called with true for use_metadef_id_creator"); - - // if the EP is shared across multiple sessions there's a very small potential for concurrency issues. 
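The execution_providers.h change above registers an ETW capture-state callback: when a trace session that attached late requests capture-state, the provider re-emits (replays) its current provider options so the listener still sees them. A platform-neutral sketch of that rundown pattern using plain callbacks instead of real ETW; all names here are illustrative:

```cpp
#include <functional>
#include <iostream>
#include <map>
#include <string>
#include <utility>
#include <vector>

using CaptureStateCallback = std::function<void()>;
static std::vector<CaptureStateCallback> g_callbacks;

void RegisterCallback(CaptureStateCallback cb) { g_callbacks.push_back(std::move(cb)); }

// Simulates EVENT_CONTROL_CODE_CAPTURE_STATE arriving from a trace controller.
void OnCaptureStateRequested() {
  for (auto& cb : g_callbacks) cb();
}

int main() {
  std::map<std::string, std::string> provider_options{{"device_id", "0"}};
  RegisterCallback([&] {
    for (auto& [k, v] : provider_options)
      std::cout << "replay option " << k << "=" << v << "\n";
  });
  OnCaptureStateRequested();  // a late listener gets the current options replayed
}
```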
- // use a lock when generating an id to be paranoid - static OrtMutex mutex; - std::lock_guard lock(mutex); - return metadef_id_generator_->GenerateId(graph_viewer, model_hash); -} - } // namespace onnxruntime diff --git a/onnxruntime/core/framework/execution_providers.h b/onnxruntime/core/framework/execution_providers.h index d97953fd9d5e..dc45cad692b6 100644 --- a/onnxruntime/core/framework/execution_providers.h +++ b/onnxruntime/core/framework/execution_providers.h @@ -3,7 +3,6 @@ #pragma once -// #include #include #include #include @@ -13,7 +12,10 @@ #include "core/graph/graph_viewer.h" #include "core/common/logging/logging.h" #ifdef _WIN32 +#include +#include #include "core/platform/tracing.h" +#include "core/platform/windows/telemetry.h" #endif namespace onnxruntime { @@ -43,20 +45,62 @@ class ExecutionProviders { exec_provider_options_[provider_id] = providerOptions; #ifdef _WIN32 + LogProviderOptions(provider_id, providerOptions, false); + + // Register callback for ETW capture state (rundown) + WindowsTelemetry::RegisterInternalCallback( + [this]( + LPCGUID SourceId, + ULONG IsEnabled, + UCHAR Level, + ULONGLONG MatchAnyKeyword, + ULONGLONG MatchAllKeyword, + PEVENT_FILTER_DESCRIPTOR FilterData, + PVOID CallbackContext) { + (void)SourceId; + (void)Level; + (void)MatchAnyKeyword; + (void)MatchAllKeyword; + (void)FilterData; + (void)CallbackContext; + + // Check if this callback is for capturing state + if ((IsEnabled == EVENT_CONTROL_CODE_CAPTURE_STATE) && + ((MatchAnyKeyword & static_cast(onnxruntime::logging::ORTTraceLoggingKeyword::Session)) != 0)) { + for (size_t i = 0; i < exec_providers_.size(); ++i) { + const auto& provider_id = exec_provider_ids_[i]; + + auto it = exec_provider_options_.find(provider_id); + if (it != exec_provider_options_.end()) { + const auto& options = it->second; + + LogProviderOptions(provider_id, options, true); + } + } + } + }); +#endif + + exec_provider_ids_.push_back(provider_id); + exec_providers_.push_back(p_exec_provider); + return Status::OK(); + } + +#ifdef _WIN32 + void LogProviderOptions(const std::string& provider_id, const ProviderOptions& providerOptions, bool captureState) { for (const auto& config_pair : providerOptions) { TraceLoggingWrite( telemetry_provider_handle, "ProviderOptions", + TraceLoggingKeyword(static_cast(onnxruntime::logging::ORTTraceLoggingKeyword::Session)), + TraceLoggingLevel(WINEVENT_LEVEL_INFO), TraceLoggingString(provider_id.c_str(), "ProviderId"), TraceLoggingString(config_pair.first.c_str(), "Key"), - TraceLoggingString(config_pair.second.c_str(), "Value")); + TraceLoggingString(config_pair.second.c_str(), "Value"), + TraceLoggingBool(captureState, "isCaptureState")); } -#endif - - exec_provider_ids_.push_back(provider_id); - exec_providers_.push_back(p_exec_provider); - return Status::OK(); } +#endif const IExecutionProvider* Get(const onnxruntime::Node& node) const { return Get(node.GetExecutionProviderType()); diff --git a/onnxruntime/core/framework/execution_steps.cc b/onnxruntime/core/framework/execution_steps.cc index df19236d037c..b647833cfd37 100644 --- a/onnxruntime/core/framework/execution_steps.cc +++ b/onnxruntime/core/framework/execution_steps.cc @@ -1,8 +1,11 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. 
+ #include "core/framework/execution_steps.h" #include "core/framework/sequential_executor.h" + namespace onnxruntime { + BarrierStep::BarrierStep(size_t id, NodeIndex node_index) : SequentialExecutionPlan::ExecutionStep(node_index), barrier_id_{id} {} @@ -16,8 +19,8 @@ Status BarrierStep::Execute(StreamExecutionContext& ctx, } std::string BarrierStep::ToString() const { - return ::onnxruntime::MakeString("Set a barrier with id: ", - barrier_id_, ", count: ", 2, "."); + // Set a barrier with id: barrier_id_, count: 2. + return MakeString("Barrier - BarrierId: ", barrier_id_, ", Count: ", 2); } WaitOnEPStep::WaitOnEPStep(WaitNotificationFn handle, @@ -42,11 +45,17 @@ Status WaitOnEPStep::Execute(StreamExecutionContext& ctx, } std::string WaitOnEPStep::ToString() const { - return ::onnxruntime::MakeString("WaitOnEPStep: wait on notification with id: ", - notification_idx_, ". "); + // Wait on notification with notification_idx_ + return MakeString("WaitOnEP - NotificationId: ", notification_idx_); } -LaunchKernelStep::LaunchKernelStep(NodeIndex index) : SequentialExecutionPlan::ExecutionStep(index) {} +#if defined(ORT_MINIMAL_BUILD) +LaunchKernelStep::LaunchKernelStep(NodeIndex index) + : SequentialExecutionPlan::ExecutionStep(index) {} +#else +LaunchKernelStep::LaunchKernelStep(NodeIndex index, std::string_view node_name) + : SequentialExecutionPlan::ExecutionStep(index), node_name_(node_name) {} +#endif Status LaunchKernelStep::Execute(StreamExecutionContext& ctx, size_t stream_idx, @@ -61,13 +70,17 @@ Status LaunchKernelStep::Execute(StreamExecutionContext& ctx, return Status::OK(); } #endif - onnxruntime::Status status = ExecuteKernel(ctx, node_index_, stream_idx, terminate_flag, session_scope); + Status status = ExecuteKernel(ctx, node_index_, stream_idx, terminate_flag, session_scope); continue_flag = status.IsOK(); return status; } std::string LaunchKernelStep::ToString() const { - return ::onnxruntime::MakeString("Launch kernel with node id: ", node_index_, ". "); +#if defined(ORT_MINIMAL_BUILD) + return MakeString("LaunchKernel - ", "NodeIndex: ", node_index_); +#else + return MakeString("LaunchKernel - ", "NodeIndex: ", node_index_, ", Name: ", node_name_); +#endif } ActivateNotificationStep::ActivateNotificationStep( @@ -89,12 +102,12 @@ Status ActivateNotificationStep::Execute(StreamExecutionContext& ctx, } std::string ActivateNotificationStep::ToString() const { - return ::onnxruntime::MakeString("ActivateNotificationStep: activate notification with id: ", - notification_idx_, ". "); + // Activate notification with id: notification_idx_ + return MakeString("ActivateNotification - NotificationId: ", notification_idx_); } -TriggerDownstreamStep::TriggerDownstreamStep(size_t trigger_point_index, NodeIndex node_index) : SequentialExecutionPlan::ExecutionStep(node_index), - trigger_point_index_(trigger_point_index) {} +TriggerDownstreamStep::TriggerDownstreamStep(size_t trigger_point_index, NodeIndex node_index) + : SequentialExecutionPlan::ExecutionStep(node_index), trigger_point_index_(trigger_point_index) {} Status TriggerDownstreamStep::Execute(StreamExecutionContext& ctx, size_t /*stream_idx*/, @@ -107,7 +120,8 @@ Status TriggerDownstreamStep::Execute(StreamExecutionContext& ctx, } std::string TriggerDownstreamStep::ToString() const { - return ::onnxruntime::MakeString("TriggerDownstreamStep: trigger downstream of trigger point: ", - trigger_point_index_, "."); + // Trigger downstream of trigger point: trigger_point_index_. 
+ return MakeString("TriggerDownstream - TriggerPointIndex: ", trigger_point_index_); } + } // namespace onnxruntime diff --git a/onnxruntime/core/framework/execution_steps.h b/onnxruntime/core/framework/execution_steps.h index b67b58390082..545dabc56b27 100644 --- a/onnxruntime/core/framework/execution_steps.h +++ b/onnxruntime/core/framework/execution_steps.h @@ -44,7 +44,11 @@ class WaitOnEPStep : public SequentialExecutionPlan::ExecutionStep { class LaunchKernelStep : public SequentialExecutionPlan::ExecutionStep { public: +#if defined(ORT_MINIMAL_BUILD) LaunchKernelStep(NodeIndex index); +#else + LaunchKernelStep(NodeIndex index, std::string_view node_name); +#endif Status Execute(StreamExecutionContext& ctx, size_t stream_idx, @@ -53,6 +57,11 @@ class LaunchKernelStep : public SequentialExecutionPlan::ExecutionStep { bool& continue_flag) override; std::string ToString() const override; + +#if !defined(ORT_MINIMAL_BUILD) + private: + std::string node_name_; +#endif }; class ActivateNotificationStep : public SequentialExecutionPlan::ExecutionStep { diff --git a/onnxruntime/core/framework/feeds_fetches_manager.h b/onnxruntime/core/framework/feeds_fetches_manager.h index 75cb7485a6e3..c2c1be64f3e1 100644 --- a/onnxruntime/core/framework/feeds_fetches_manager.h +++ b/onnxruntime/core/framework/feeds_fetches_manager.h @@ -25,7 +25,7 @@ enum class DeviceCopyCheck { }; struct DeviceCopyChecks { - DeviceCopyCheck status = DeviceCopyCheck::Unknown; ///< Overall status. If NoCopy no input or output copies are needed + DeviceCopyCheck status = DeviceCopyCheck::Unknown; ///< Overall status. NoCopy means input_copy_needed and output_copy_needed are both NoCopy DeviceCopyCheck input_copy_needed = DeviceCopyCheck::Unknown; DeviceCopyCheck output_copy_needed = DeviceCopyCheck::Unknown; }; @@ -73,6 +73,9 @@ struct FeedsFetchesInfo { struct MLValueCopyInfo { OrtDevice source_device{}; OrtDevice target_device{}; // default is CPU + + // if all the consume ops are from the same stream, this variable is the stream index; otherwise -1 + int unique_stream_index_consumes_it = -1; }; class FeedsFetchesManager { diff --git a/onnxruntime/core/framework/graph_partitioner.cc b/onnxruntime/core/framework/graph_partitioner.cc index e4fe0c756454..90ee8a46f66a 100644 --- a/onnxruntime/core/framework/graph_partitioner.cc +++ b/onnxruntime/core/framework/graph_partitioner.cc @@ -16,6 +16,7 @@ #include "core/graph/function_utils.h" #include "core/graph/graph_viewer.h" #include "core/graph/model.h" +#include "core/session/onnxruntime_session_options_config_keys.h" // uncomment this line to count non-CUDA ops in ONNX domain // #define COUNT_NON_CUDA_OPS @@ -634,6 +635,98 @@ static Status InlineFunctionsAOTImpl(const ExecutionProviders& execution_provide return Status::OK(); } +static Status CreateEpContextModel(const ExecutionProviders& execution_providers, + const Graph& graph, + const std::string& ep_context_path, + const logging::Logger& logger) { + InlinedVector all_ep_context_nodes; + for (const auto& ep : execution_providers) { + const InlinedVector ep_context_nodes = ep->GetEpContextNodes(); + all_ep_context_nodes.insert(all_ep_context_nodes.begin(), ep_context_nodes.begin(), ep_context_nodes.end()); + } + + if (all_ep_context_nodes.size() < 1) { + return Status::OK(); + } + + auto get_ep_context_node = [&all_ep_context_nodes](const std::string& node_name) -> std::pair { + for (auto& node : all_ep_context_nodes) { + if (node_name == node->Name()) { + return std::make_pair(true, node); + } + } + return 
std::make_pair(false, static_cast(nullptr)); + }; + + onnxruntime::PathString context_cache_path; + PathString model_pathstring = graph.ModelPath().ToPathString(); + + if (!ep_context_path.empty()) { + context_cache_path = ToPathString(ep_context_path); + } else if (!model_pathstring.empty()) { + context_cache_path = model_pathstring + ToPathString("_ctx.onnx"); + } + + { +#ifdef _WIN32 + std::wifstream fs(context_cache_path); +#else + std::ifstream fs(context_cache_path); +#endif + ORT_RETURN_IF(fs.good(), "Failed to generate EP context model since the file exists already."); + } + + Model ep_context_model(graph.Name(), false, ModelMetaData(), PathString(), IOnnxRuntimeOpSchemaRegistryList(), + graph.DomainToVersionMap(), {}, logger); + auto& ep_graph = ep_context_model.MainGraph(); + ep_graph.SetDescription(graph.Description()); + + // Set inputs and outputs explicitly to make sure the order is the same as in the user model. + auto inputs = graph.GetInputs(); + auto outputs = graph.GetOutputs(); + + InlinedVector ep_graph_inputs; + ep_graph_inputs.reserve(inputs.size()); + for (auto& input : inputs) { + auto input_arg = graph.GetNodeArg(input->Name()); + auto& ep_graph_input_arg = ep_graph.GetOrCreateNodeArg(input_arg->Name(), input_arg->TypeAsProto()); + ep_graph_inputs.push_back(&ep_graph_input_arg); + } + + InlinedVector ep_graph_outputs; + ep_graph_outputs.reserve(outputs.size()); + for (auto& output : outputs) { + auto output_arg = graph.GetNodeArg(output->Name()); + auto& ep_graph_output_arg = ep_graph.GetOrCreateNodeArg(output_arg->Name(), output_arg->TypeAsProto()); + ep_graph_outputs.push_back(&ep_graph_output_arg); + } + + ep_graph.SetInputs(ep_graph_inputs); + ep_graph.SetOutputs(ep_graph_outputs); + + for (const auto& node : graph.Nodes()) { + // the fused node and the EPContext node have the same node name + auto ep_context_node = get_ep_context_node(node.Name()); + // Use the EPContext node created by the EPs if the name matches, otherwise use the node from the original model + if (ep_context_node.first) { + ep_graph.AddNode(*ep_context_node.second); + } else { + ep_graph.AddNode(node); + } + } + + // handle initializers + for (const auto& initialized_tensor : graph.GetAllInitializedTensors()) { + if (ep_graph.GetNodeArg(initialized_tensor.first) != nullptr) { + ep_graph.AddInitializedTensor(*initialized_tensor.second); + } + } + + ORT_RETURN_IF_ERROR(Model::Save(ep_context_model, context_cache_path)); + + return Status::OK(); +} + static Status PartitionOnnxFormatModel(const PartitionParams& partition_params, GraphPartitioner::Mode mode, const ExecutionProviders& execution_providers, KernelRegistryManager& kernel_registry_manager) { @@ -840,6 +933,8 @@ Status GraphPartitioner::InlineFunctionsAOT(Model& model, Status GraphPartitioner::Partition(Graph& graph, FuncManager& func_mgr, const layout_transformation::TransformLayoutFunction& transform_layout_function, + const ConfigOptions& config_options, + const logging::Logger& logger, Mode mode, const layout_transformation::DebugGraphFn& debug_graph_fn) const { // Currently this is a greedy partitioning algorithm based on the provider preferences the user supplied when calling ONNX Runtime.
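The path handling in CreateEpContextModel above prefers the explicitly configured path, otherwise derives "<model>_ctx.onnx" from the model path, and refuses to overwrite an existing file. A sketch of that resolution logic using std::filesystem in place of the PathString/ifstream machinery; the helper name is illustrative:

```cpp
#include <filesystem>
#include <iostream>
#include <string>

std::string ResolveContextCachePath(const std::string& configured_path,
                                    const std::string& model_path) {
  if (!configured_path.empty()) return configured_path;       // explicit setting wins
  if (!model_path.empty()) return model_path + "_ctx.onnx";   // derived default
  return {};
}

int main() {
  const std::string path = ResolveContextCachePath("", "model.onnx");
  if (std::filesystem::exists(path)) {
    std::cerr << "refusing to overwrite " << path << "\n";  // mirrors the fs.good() check
    return 1;
  }
  std::cout << "would save EP context model to " << path << "\n";
}
```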
@@ -886,7 +981,15 @@ Status GraphPartitioner::Partition(Graph& graph, FuncManager& func_mgr, #if !defined(ORT_MINIMAL_BUILD) ORT_RETURN_IF_ERROR(PartitionOnnxFormatModel(partition_params, mode, providers_, kernel_registry_mgr_)); + + bool ep_context_enabled = config_options.GetConfigOrDefault(kOrtSessionOptionEpContextEnable, "0") == "1"; + std::string ep_context_path = config_options.GetConfigOrDefault(kOrtSessionOptionEpContextFilePath, ""); + if (ep_context_enabled) { + ORT_RETURN_IF_ERROR(CreateEpContextModel(providers_, graph, ep_context_path, logger)); + } #else + ORT_UNUSED_PARAMETER(config_options); + ORT_UNUSED_PARAMETER(logger); return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "ONNX models are not supported in this build."); #endif //! defined(ORT_MINIMAL_BUILD) } else { diff --git a/onnxruntime/core/framework/graph_partitioner.h b/onnxruntime/core/framework/graph_partitioner.h index 4fc85c258826..d1ef193cf152 100644 --- a/onnxruntime/core/framework/graph_partitioner.h +++ b/onnxruntime/core/framework/graph_partitioner.h @@ -13,6 +13,7 @@ namespace onnxruntime { class ExecutionProviders; class KernelRegistryManager; class Model; +struct ConfigOptions; class GraphPartitioner { public: @@ -31,6 +32,8 @@ class GraphPartitioner { // Run partitioning. Status Partition(Graph& graph, FuncManager& func_mgr, const layout_transformation::TransformLayoutFunction& transform_layout_function, + const ConfigOptions& config_options, + const logging::Logger& logger, Mode mode = Mode::kNormal, const layout_transformation::DebugGraphFn& debug_graph_fn = {}) const; diff --git a/onnxruntime/core/framework/kernel_registry_manager.cc b/onnxruntime/core/framework/kernel_registry_manager.cc index b2ef85311958..f8ccdb8fb023 100644 --- a/onnxruntime/core/framework/kernel_registry_manager.cc +++ b/onnxruntime/core/framework/kernel_registry_manager.cc @@ -24,7 +24,8 @@ Status KernelRegistryManager::CreateKernel(const Node& node, session_state.GetConstantInitializedTensors(), session_state.GetOrtValueNameIdxMap(), session_state.GetDataTransferMgr(), - session_state.GetAllocators()); + session_state.GetAllocators(), + session_state.GetSessionOptions().config_options); return kernel_create_info.kernel_create_func(session_state.GetMutableFuncMgr(), kernel_info, out); } diff --git a/onnxruntime/core/framework/kernel_type_str_resolver.h b/onnxruntime/core/framework/kernel_type_str_resolver.h index 31a806dd5229..fea2a6ef3a43 100644 --- a/onnxruntime/core/framework/kernel_type_str_resolver.h +++ b/onnxruntime/core/framework/kernel_type_str_resolver.h @@ -7,7 +7,7 @@ #include #include -#include "flatbuffers/flatbuffers.h" +#include "core/common/flatbuffers.h" #if !defined(ORT_MINIMAL_BUILD) #include "core/graph/onnx_protobuf.h" diff --git a/onnxruntime/core/framework/kernel_type_str_resolver_utils.cc b/onnxruntime/core/framework/kernel_type_str_resolver_utils.cc index 4f5fa9910b5d..423307b4c8fc 100644 --- a/onnxruntime/core/framework/kernel_type_str_resolver_utils.cc +++ b/onnxruntime/core/framework/kernel_type_str_resolver_utils.cc @@ -5,7 +5,7 @@ #include "core/framework/kernel_type_str_resolver_utils.h" -#include "flatbuffers/flatbuffers.h" +#include "core/common/flatbuffers.h" #include "core/common/common.h" #include "core/flatbuffers/schema/ort.fbs.h" @@ -53,200 +53,240 @@ Status AddLayoutTransformationRequiredOpsToKernelTypeStrResolver(KernelTypeStrRe // clang-format off constexpr uint8_t kLayoutTransformationRequiredOpsKernelTypeStrResolverBytes[] = { 0x10, 0x00, 0x00, 0x00, 0x6b, 0x74, 0x73, 0x72, 0x00, 0x00, 0x06, 
0x00, 0x08, 0x00, 0x04, 0x00, - 0x06, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x1a, 0x00, 0x00, 0x00, 0xb4, 0x00, 0x00, 0x00, - 0x4c, 0x0b, 0x00, 0x00, 0xac, 0x08, 0x00, 0x00, 0xd0, 0x0a, 0x00, 0x00, 0x10, 0x06, 0x00, 0x00, - 0xa8, 0x07, 0x00, 0x00, 0x18, 0x03, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x48, 0x00, 0x00, 0x00, - 0x44, 0x07, 0x00, 0x00, 0x9c, 0x01, 0x00, 0x00, 0xf8, 0x07, 0x00, 0x00, 0x78, 0x09, 0x00, 0x00, - 0x14, 0x01, 0x00, 0x00, 0x50, 0x06, 0x00, 0x00, 0x60, 0x02, 0x00, 0x00, 0xf4, 0x08, 0x00, 0x00, - 0x8c, 0x03, 0x00, 0x00, 0x9c, 0x02, 0x00, 0x00, 0x84, 0x06, 0x00, 0x00, 0xcc, 0x03, 0x00, 0x00, - 0x60, 0x05, 0x00, 0x00, 0xb8, 0x01, 0x00, 0x00, 0x1c, 0x03, 0x00, 0x00, 0x08, 0x04, 0x00, 0x00, - 0xe0, 0x09, 0x00, 0x00, 0x8c, 0xf4, 0xff, 0xff, 0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, - 0x01, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00, 0x3a, 0x49, 0x64, 0x65, - 0x6e, 0x74, 0x69, 0x74, 0x79, 0x3a, 0x31, 0x34, 0x00, 0x00, 0x00, 0x00, 0xb4, 0xf4, 0xff, 0xff, - 0x08, 0x07, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, - 0x04, 0x00, 0x00, 0x00, 0xda, 0xf4, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, 0x9c, 0xf4, 0xff, 0xff, - 0xd8, 0xf4, 0xff, 0xff, 0x18, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, - 0x60, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, 0x3c, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, - 0x3a, 0x44, 0x65, 0x71, 0x75, 0x61, 0x6e, 0x74, 0x69, 0x7a, 0x65, 0x4c, 0x69, 0x6e, 0x65, 0x61, - 0x72, 0x3a, 0x31, 0x30, 0x00, 0x00, 0x00, 0x00, 0x10, 0xf5, 0xff, 0xff, 0xa4, 0x0a, 0x00, 0x00, - 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0xfc, 0xf4, 0xff, 0xff, - 0x01, 0x00, 0x00, 0x00, 0x2c, 0xf5, 0xff, 0xff, 0xb0, 0x0a, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, - 0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x4e, 0xf5, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, - 0x48, 0xf5, 0xff, 0xff, 0xc8, 0x0a, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, - 0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x38, 0xf5, 0xff, 0xff, 0x02, 0x00, 0x00, 0x00, - 0x30, 0xf5, 0xff, 0xff, 0x6c, 0xf5, 0xff, 0xff, 0x14, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, - 0x02, 0x00, 0x00, 0x00, 0x48, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x12, 0x00, 0x00, 0x00, - 0x3a, 0x51, 0x75, 0x61, 0x6e, 0x74, 0x69, 0x7a, 0x65, 0x4c, 0x69, 0x6e, 0x65, 0x61, 0x72, 0x3a, - 0x31, 0x39, 0x00, 0x00, 0x9c, 0xf5, 0xff, 0xff, 0x3c, 0x09, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, - 0x02, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0xc2, 0xf5, 0xff, 0xff, - 0x00, 0x00, 0x00, 0x01, 0x94, 0xf5, 0xff, 0xff, 0x02, 0x00, 0x00, 0x00, 0xc4, 0xf5, 0xff, 0xff, - 0xe8, 0x08, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, - 0x04, 0x00, 0x00, 0x00, 0xb4, 0xf5, 0xff, 0xff, 0x01, 0x00, 0x00, 0x00, 0xac, 0xf5, 0xff, 0xff, - 0xe8, 0xf5, 0xff, 0xff, 0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, - 0x18, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00, 0x3a, 0x49, 0x64, 0x65, 0x6e, 0x74, 0x69, 0x74, - 0x79, 0x3a, 0x31, 0x39, 0x00, 0x00, 0x00, 0x00, 0x10, 0xf6, 0xff, 0xff, 0xac, 0x05, 0x00, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0x88, 0x0d, 0x00, 0x00, + 0xec, 0x06, 0x00, 0x00, 0x68, 0x06, 0x00, 0x00, 0x1c, 0x08, 0x00, 0x00, 0xc8, 0x02, 0x00, 0x00, + 0x2c, 0x03, 0x00, 0x00, 0x80, 0x01, 0x00, 0x00, 0xc0, 0x09, 0x00, 0x00, 0xdc, 0x03, 0x00, 0x00, + 0x6c, 0x09, 0x00, 0x00, 0x64, 0x02, 0x00, 0x00, 0xbc, 0x0c, 0x00, 0x00, 0x04, 0x0d, 0x00, 
0x00, + 0xd4, 0x00, 0x00, 0x00, 0x10, 0x04, 0x00, 0x00, 0x04, 0x05, 0x00, 0x00, 0x68, 0x08, 0x00, 0x00, + 0x70, 0x03, 0x00, 0x00, 0xf0, 0x0d, 0x00, 0x00, 0x8c, 0x04, 0x00, 0x00, 0x6c, 0x05, 0x00, 0x00, + 0x94, 0x0a, 0x00, 0x00, 0x44, 0x0c, 0x00, 0x00, 0x28, 0x07, 0x00, 0x00, 0xc4, 0x05, 0x00, 0x00, + 0xc0, 0x09, 0x00, 0x00, 0x08, 0x0a, 0x00, 0x00, 0xb8, 0x08, 0x00, 0x00, 0x90, 0x01, 0x00, 0x00, + 0x5c, 0x07, 0x00, 0x00, 0xbc, 0x0a, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x24, 0xf2, 0xff, 0xff, + 0x14, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x54, 0x00, 0x00, 0x00, + 0x28, 0x00, 0x00, 0x00, 0x1e, 0x00, 0x00, 0x00, 0x63, 0x6f, 0x6d, 0x2e, 0x6d, 0x69, 0x63, 0x72, + 0x6f, 0x73, 0x6f, 0x66, 0x74, 0x3a, 0x51, 0x75, 0x61, 0x6e, 0x74, 0x69, 0x7a, 0x65, 0x4c, 0x69, + 0x6e, 0x65, 0x61, 0x72, 0x3a, 0x31, 0x00, 0x00, 0x60, 0xf2, 0xff, 0xff, 0x64, 0x0b, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, - 0x36, 0xf6, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, 0xf8, 0xf5, 0xff, 0xff, 0x34, 0xf6, 0xff, 0xff, - 0x14, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x30, 0x00, 0x00, 0x00, - 0x50, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0x63, 0x6f, 0x6d, 0x2e, 0x6d, 0x69, 0x63, 0x72, + 0x4e, 0xf2, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, 0xb8, 0xf2, 0xff, 0xff, 0x02, 0x00, 0x00, 0x00, + 0x88, 0xf2, 0xff, 0xff, 0x10, 0x0b, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0xd8, 0xf2, 0xff, 0xff, 0x01, 0x00, 0x00, 0x00, + 0x70, 0xf2, 0xff, 0xff, 0xac, 0xf2, 0xff, 0xff, 0x18, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, 0x38, 0x00, 0x00, 0x00, 0x5c, 0x00, 0x00, 0x00, + 0x12, 0x00, 0x00, 0x00, 0x3a, 0x51, 0x75, 0x61, 0x6e, 0x74, 0x69, 0x7a, 0x65, 0x4c, 0x69, 0x6e, + 0x65, 0x61, 0x72, 0x3a, 0x31, 0x30, 0x00, 0x00, 0xe0, 0xf2, 0xff, 0xff, 0xb8, 0x0a, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0xbc, 0xf2, 0xff, 0xff, + 0xf8, 0xf2, 0xff, 0xff, 0xcc, 0x0a, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0xe6, 0xf2, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, + 0x50, 0xf3, 0xff, 0xff, 0x02, 0x00, 0x00, 0x00, 0x20, 0xf3, 0xff, 0xff, 0x50, 0x0a, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x6c, 0xf3, 0xff, 0xff, + 0x01, 0x00, 0x00, 0x00, 0x3c, 0xf3, 0xff, 0xff, 0x14, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x34, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x0a, 0x00, 0x00, 0x00, + 0x3a, 0x47, 0x61, 0x74, 0x68, 0x65, 0x72, 0x3a, 0x31, 0x33, 0x00, 0x00, 0x64, 0xf3, 0xff, 0xff, + 0xd4, 0x01, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0xb0, 0xf3, 0xff, 0xff, 0x01, 0x00, 0x00, 0x00, 0x80, 0xf3, 0xff, 0xff, 0x90, 0x0c, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x6e, 0xf3, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, 0x68, 0xf3, 0xff, 0xff, 0xa4, 0xf3, 0xff, 0xff, + 0x14, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x58, 0x00, 0x00, 0x00, + 0x2c, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0x63, 0x6f, 0x6d, 0x2e, 0x6d, 0x69, 0x63, 0x72, 0x6f, 0x73, 0x6f, 0x66, 0x74, 0x3a, 0x44, 0x65, 0x71, 0x75, 0x61, 0x6e, 0x74, 0x69, 0x7a, 0x65, - 0x4c, 0x69, 0x6e, 0x65, 0x61, 0x72, 0x3a, 0x31, 0x00, 0x00, 0x00, 0x00, 0x74, 0xf6, 0xff, 0xff, - 0x38, 0x08, 0x00, 
0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, - 0x04, 0x00, 0x00, 0x00, 0x64, 0xf6, 0xff, 0xff, 0x02, 0x00, 0x00, 0x00, 0x5c, 0xf6, 0xff, 0xff, - 0x98, 0xf6, 0xff, 0xff, 0x40, 0x08, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, - 0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0xbe, 0xf6, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, - 0x90, 0xf6, 0xff, 0xff, 0x01, 0x00, 0x00, 0x00, 0xc0, 0xf6, 0xff, 0xff, 0x10, 0x00, 0x00, 0x00, - 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, - 0x3a, 0x53, 0x71, 0x75, 0x65, 0x65, 0x7a, 0x65, 0x3a, 0x31, 0x31, 0x00, 0xe4, 0xf6, 0xff, 0xff, - 0x2c, 0x09, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, - 0x04, 0x00, 0x00, 0x00, 0x0a, 0xf7, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, 0xcc, 0xf6, 0xff, 0xff, - 0x08, 0xf7, 0xff, 0xff, 0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, - 0x18, 0x00, 0x00, 0x00, 0x0d, 0x00, 0x00, 0x00, 0x3a, 0x54, 0x72, 0x61, 0x6e, 0x73, 0x70, 0x6f, - 0x73, 0x65, 0x3a, 0x31, 0x33, 0x00, 0x00, 0x00, 0x30, 0xf7, 0xff, 0xff, 0xe0, 0x08, 0x00, 0x00, + 0x4c, 0x69, 0x6e, 0x65, 0x61, 0x72, 0x3a, 0x31, 0x00, 0x00, 0x00, 0x00, 0xe4, 0xf3, 0xff, 0xff, + 0xe0, 0x09, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0xd2, 0xf3, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, 0x3c, 0xf4, 0xff, 0xff, + 0x01, 0x00, 0x00, 0x00, 0x0c, 0xf4, 0xff, 0xff, 0x8c, 0x09, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x5c, 0xf4, 0xff, 0xff, + 0x02, 0x00, 0x00, 0x00, 0xf4, 0xf3, 0xff, 0xff, 0x30, 0xf4, 0xff, 0xff, 0x10, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00, + 0x3a, 0x49, 0x64, 0x65, 0x6e, 0x74, 0x69, 0x74, 0x79, 0x3a, 0x31, 0x36, 0x00, 0x00, 0x00, 0x00, + 0x58, 0xf4, 0xff, 0xff, 0xb0, 0x0a, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x46, 0xf4, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, + 0x40, 0xf4, 0xff, 0xff, 0x7c, 0xf4, 0xff, 0xff, 0x14, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x34, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x09, 0x00, 0x00, 0x00, + 0x3a, 0x47, 0x61, 0x74, 0x68, 0x65, 0x72, 0x3a, 0x31, 0x00, 0x00, 0x00, 0xa4, 0xf4, 0xff, 0xff, + 0x94, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0xf0, 0xf4, 0xff, 0xff, 0x01, 0x00, 0x00, 0x00, 0xc0, 0xf4, 0xff, 0xff, 0x50, 0x0b, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, - 0x56, 0xf7, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, 0x18, 0xf7, 0xff, 0xff, 0x54, 0xf7, 0xff, 0xff, - 0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, - 0x0b, 0x00, 0x00, 0x00, 0x3a, 0x49, 0x64, 0x65, 0x6e, 0x74, 0x69, 0x74, 0x79, 0x3a, 0x31, 0x00, - 0x78, 0xf7, 0xff, 0xff, 0x98, 0x08, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, - 0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x9e, 0xf7, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, - 0x60, 0xf7, 0xff, 0xff, 0x9c, 0xf7, 0xff, 0xff, 0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, - 0x01, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, 0x1b, 0x00, 0x00, 0x00, 0x63, 0x6f, 0x6d, 0x2e, - 0x6d, 0x69, 0x63, 0x72, 0x6f, 0x73, 0x6f, 0x66, 0x74, 0x3a, 0x4e, 0x68, 0x77, 0x63, 0x4d, 0x61, - 0x78, 0x50, 0x6f, 0x6f, 0x6c, 0x3a, 0x31, 
0x00, 0xd0, 0xf7, 0xff, 0xff, 0x40, 0x08, 0x00, 0x00, + 0xae, 0xf4, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, 0xa8, 0xf4, 0xff, 0xff, 0xe4, 0xf4, 0xff, 0xff, + 0x14, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, + 0x38, 0x00, 0x00, 0x00, 0x0a, 0x00, 0x00, 0x00, 0x3a, 0x47, 0x61, 0x74, 0x68, 0x65, 0x72, 0x3a, + 0x31, 0x31, 0x00, 0x00, 0x0c, 0xf5, 0xff, 0xff, 0x04, 0x0b, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0xfa, 0xf4, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x01, 0xf4, 0xf4, 0xff, 0xff, 0x30, 0xf5, 0xff, 0xff, 0x08, 0x00, 0x00, 0x00, + 0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x54, 0x69, 0x6e, 0x64, 0x00, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x88, 0xf5, 0xff, 0xff, 0x01, 0x00, 0x00, 0x00, + 0x58, 0xf5, 0xff, 0xff, 0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x14, 0x00, 0x00, 0x00, 0x0a, 0x00, 0x00, 0x00, 0x3a, 0x53, 0x71, 0x75, 0x65, 0x65, 0x7a, 0x65, + 0x3a, 0x31, 0x00, 0x00, 0x7c, 0xf5, 0xff, 0xff, 0x94, 0x0a, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x6a, 0xf5, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x01, 0x64, 0xf5, 0xff, 0xff, 0xa0, 0xf5, 0xff, 0xff, 0x10, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00, + 0x3a, 0x49, 0x64, 0x65, 0x6e, 0x74, 0x69, 0x74, 0x79, 0x3a, 0x31, 0x33, 0x00, 0x00, 0x00, 0x00, + 0xc8, 0xf5, 0xff, 0xff, 0x48, 0x0a, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0xb6, 0xf5, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, + 0xb0, 0xf5, 0xff, 0xff, 0xec, 0xf5, 0xff, 0xff, 0x18, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x4c, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0x5c, 0x00, 0x00, 0x00, + 0x12, 0x00, 0x00, 0x00, 0x3a, 0x51, 0x75, 0x61, 0x6e, 0x74, 0x69, 0x7a, 0x65, 0x4c, 0x69, 0x6e, + 0x65, 0x61, 0x72, 0x3a, 0x31, 0x33, 0x00, 0x00, 0x20, 0xf6, 0xff, 0xff, 0xa4, 0x07, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, - 0xf6, 0xf7, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, 0xb8, 0xf7, 0xff, 0xff, 0xf4, 0xf7, 0xff, 0xff, - 0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, - 0x0c, 0x00, 0x00, 0x00, 0x3a, 0x54, 0x72, 0x61, 0x6e, 0x73, 0x70, 0x6f, 0x73, 0x65, 0x3a, 0x31, - 0x00, 0x00, 0x00, 0x00, 0x1c, 0xf8, 0xff, 0xff, 0xf4, 0x07, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, - 0x02, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x42, 0xf8, 0xff, 0xff, - 0x00, 0x00, 0x00, 0x01, 0x04, 0xf8, 0xff, 0xff, 0x40, 0xf8, 0xff, 0xff, 0x10, 0x00, 0x00, 0x00, - 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x0d, 0x00, 0x00, 0x00, - 0x3a, 0x55, 0x6e, 0x73, 0x71, 0x75, 0x65, 0x65, 0x7a, 0x65, 0x3a, 0x31, 0x31, 0x00, 0x00, 0x00, - 0x68, 0xf8, 0xff, 0xff, 0xa8, 0x07, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, - 0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x8e, 0xf8, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, - 0x50, 0xf8, 0xff, 0xff, 0x8c, 0xf8, 0xff, 0xff, 0x28, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, - 0x07, 0x00, 0x00, 0x00, 0xf4, 0x00, 0x00, 0x00, 0xc8, 0x00, 0x00, 0x00, 0x50, 0x00, 0x00, 0x00, - 0x0c, 0x01, 0x00, 0x00, 0x94, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, 0x70, 0x00, 0x00, 0x00, - 0x1b, 0x00, 0x00, 0x00, 0x63, 0x6f, 0x6d, 0x2e, 0x6d, 0x69, 0x63, 
0x72, 0x6f, 0x73, 0x6f, 0x66, - 0x74, 0x3a, 0x51, 0x4c, 0x69, 0x6e, 0x65, 0x61, 0x72, 0x43, 0x6f, 0x6e, 0x76, 0x3a, 0x31, 0x00, - 0xd8, 0xf8, 0xff, 0xff, 0xdc, 0x06, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, - 0x04, 0x00, 0x00, 0x00, 0xc4, 0xf8, 0xff, 0xff, 0x01, 0x00, 0x00, 0x00, 0xf4, 0xf8, 0xff, 0xff, - 0x08, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x54, 0x33, 0x00, 0x00, - 0x02, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x22, 0xf9, 0xff, 0xff, - 0x00, 0x00, 0x00, 0x01, 0xf4, 0xf8, 0xff, 0xff, 0x07, 0x00, 0x00, 0x00, 0x24, 0xf9, 0xff, 0xff, - 0xe4, 0x04, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, - 0x10, 0xf9, 0xff, 0xff, 0x06, 0x00, 0x00, 0x00, 0x40, 0xf9, 0xff, 0xff, 0x08, 0x00, 0x00, 0x00, - 0x10, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x77, 0x5f, 0x73, 0x63, 0x61, 0x6c, 0x65, 0x00, - 0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x38, 0xf9, 0xff, 0xff, 0x04, 0x00, 0x00, 0x00, - 0x68, 0xf9, 0xff, 0xff, 0x70, 0x05, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, - 0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x58, 0xf9, 0xff, 0xff, 0x05, 0x00, 0x00, 0x00, - 0x60, 0xf9, 0xff, 0xff, 0x03, 0x00, 0x00, 0x00, 0x90, 0xf9, 0xff, 0xff, 0x1c, 0x05, 0x00, 0x00, + 0x0e, 0xf6, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, 0x78, 0xf6, 0xff, 0xff, 0x02, 0x00, 0x00, 0x00, + 0x48, 0xf6, 0xff, 0xff, 0x50, 0x07, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x24, 0xf6, 0xff, 0xff, 0x60, 0xf6, 0xff, 0xff, 0x10, 0x07, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0xac, 0xf6, 0xff, 0xff, + 0x01, 0x00, 0x00, 0x00, 0x7c, 0xf6, 0xff, 0xff, 0x14, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x34, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, + 0x3a, 0x53, 0x71, 0x75, 0x65, 0x65, 0x7a, 0x65, 0x3a, 0x31, 0x33, 0x00, 0xa4, 0xf6, 0xff, 0xff, + 0xc8, 0x05, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0xf0, 0xf6, 0xff, 0xff, 0x01, 0x00, 0x00, 0x00, 0xc0, 0xf6, 0xff, 0xff, 0x50, 0x09, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, - 0x80, 0xf9, 0xff, 0xff, 0x02, 0x00, 0x00, 0x00, 0x78, 0xf9, 0xff, 0xff, 0xb4, 0xf9, 0xff, 0xff, - 0x08, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x54, 0x34, 0x00, 0x00, - 0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0xa8, 0xf9, 0xff, 0xff, 0x08, 0x00, 0x00, 0x00, - 0xd8, 0xf9, 0xff, 0xff, 0x14, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, - 0x38, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x0d, 0x00, 0x00, 0x00, 0x3a, 0x55, 0x6e, 0x73, - 0x71, 0x75, 0x65, 0x65, 0x7a, 0x65, 0x3a, 0x31, 0x33, 0x00, 0x00, 0x00, 0x04, 0xfa, 0xff, 0xff, - 0x84, 0x03, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, - 0xf0, 0xf9, 0xff, 0xff, 0x01, 0x00, 0x00, 0x00, 0x20, 0xfa, 0xff, 0xff, 0xf0, 0x05, 0x00, 0x00, + 0xae, 0xf6, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, 0xa8, 0xf6, 0xff, 0xff, 0xe4, 0xf6, 0xff, 0xff, + 0x14, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x48, 0x00, 0x00, 0x00, + 0x1c, 0x00, 0x00, 0x00, 0x12, 0x00, 0x00, 0x00, 0x3a, 0x51, 0x75, 0x61, 0x6e, 0x74, 0x69, 0x7a, + 0x65, 0x4c, 0x69, 0x6e, 0x65, 0x61, 0x72, 0x3a, 0x31, 0x39, 0x00, 0x00, 0x14, 0xf7, 0xff, 0xff, + 0xb0, 0x06, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 
0x00, + 0x04, 0x00, 0x00, 0x00, 0x02, 0xf7, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, 0x6c, 0xf7, 0xff, 0xff, + 0x02, 0x00, 0x00, 0x00, 0x3c, 0xf7, 0xff, 0xff, 0x5c, 0x06, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x8c, 0xf7, 0xff, 0xff, + 0x01, 0x00, 0x00, 0x00, 0x24, 0xf7, 0xff, 0xff, 0x60, 0xf7, 0xff, 0xff, 0x14, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x38, 0x00, 0x00, 0x00, + 0x0b, 0x00, 0x00, 0x00, 0x3a, 0x53, 0x71, 0x75, 0x65, 0x65, 0x7a, 0x65, 0x3a, 0x32, 0x31, 0x00, + 0x88, 0xf7, 0xff, 0xff, 0x88, 0x08, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x76, 0xf7, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, + 0x70, 0xf7, 0xff, 0xff, 0xac, 0xf7, 0xff, 0xff, 0xc0, 0x04, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0xf8, 0xf7, 0xff, 0xff, 0x01, 0x00, 0x00, 0x00, + 0xc8, 0xf7, 0xff, 0xff, 0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x18, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00, 0x3a, 0x55, 0x6e, 0x73, 0x71, 0x75, 0x65, 0x65, + 0x7a, 0x65, 0x3a, 0x31, 0x00, 0x00, 0x00, 0x00, 0xf0, 0xf7, 0xff, 0xff, 0x20, 0x08, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, - 0x46, 0xfa, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, 0x08, 0xfa, 0xff, 0xff, 0x44, 0xfa, 0xff, 0xff, - 0x14, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x34, 0x00, 0x00, 0x00, - 0x14, 0x00, 0x00, 0x00, 0x0a, 0x00, 0x00, 0x00, 0x3a, 0x47, 0x61, 0x74, 0x68, 0x65, 0x72, 0x3a, - 0x31, 0x31, 0x00, 0x00, 0x6c, 0xfa, 0xff, 0xff, 0xc4, 0x04, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, - 0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x58, 0xfa, 0xff, 0xff, 0x01, 0x00, 0x00, 0x00, - 0x88, 0xfa, 0xff, 0xff, 0x88, 0x05, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, - 0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0xae, 0xfa, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, - 0x70, 0xfa, 0xff, 0xff, 0xac, 0xfa, 0xff, 0xff, 0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, - 0x01, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x0a, 0x00, 0x00, 0x00, 0x3a, 0x53, 0x71, 0x75, - 0x65, 0x65, 0x7a, 0x65, 0x3a, 0x31, 0x00, 0x00, 0xd0, 0xfa, 0xff, 0xff, 0x40, 0x05, 0x00, 0x00, + 0xde, 0xf7, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, 0xd8, 0xf7, 0xff, 0xff, 0x14, 0xf8, 0xff, 0xff, + 0x14, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, + 0x44, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x3a, 0x44, 0x65, 0x71, 0x75, 0x61, 0x6e, 0x74, + 0x69, 0x7a, 0x65, 0x4c, 0x69, 0x6e, 0x65, 0x61, 0x72, 0x3a, 0x31, 0x39, 0x00, 0x00, 0x00, 0x00, + 0x48, 0xf8, 0xff, 0xff, 0x50, 0x05, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x98, 0xf8, 0xff, 0xff, 0x02, 0x00, 0x00, 0x00, + 0x30, 0xf8, 0xff, 0xff, 0x6c, 0xf8, 0xff, 0xff, 0x58, 0x05, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x5a, 0xf8, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x01, 0xc4, 0xf8, 0xff, 0xff, 0x01, 0x00, 0x00, 0x00, 0x94, 0xf8, 0xff, 0xff, + 0x18, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x44, 0x00, 0x00, 0x00, + 0x64, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x3a, 0x44, 0x65, 0x71, + 0x75, 0x61, 0x6e, 0x74, 0x69, 0x7a, 0x65, 0x4c, 0x69, 0x6e, 0x65, 0x61, 0x72, 0x3a, 0x31, 0x33, + 0x00, 0x00, 0x00, 
0x00, 0xcc, 0xf8, 0xff, 0xff, 0xc8, 0x06, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0xb6, 0xf8, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, + 0xe8, 0xf8, 0xff, 0xff, 0x28, 0x07, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x38, 0xf9, 0xff, 0xff, 0x02, 0x00, 0x00, 0x00, + 0xd0, 0xf8, 0xff, 0xff, 0x0c, 0xf9, 0xff, 0xff, 0x60, 0x06, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x58, 0xf9, 0xff, 0xff, 0x01, 0x00, 0x00, 0x00, + 0x28, 0xf9, 0xff, 0xff, 0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x18, 0x00, 0x00, 0x00, 0x0d, 0x00, 0x00, 0x00, 0x3a, 0x54, 0x72, 0x61, 0x6e, 0x73, 0x70, 0x6f, + 0x73, 0x65, 0x3a, 0x32, 0x31, 0x00, 0x00, 0x00, 0x50, 0xf9, 0xff, 0xff, 0xc0, 0x06, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, - 0xf6, 0xfa, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, 0xb8, 0xfa, 0xff, 0xff, 0xf4, 0xfa, 0xff, 0xff, - 0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, - 0x0c, 0x00, 0x00, 0x00, 0x3a, 0x55, 0x6e, 0x73, 0x71, 0x75, 0x65, 0x65, 0x7a, 0x65, 0x3a, 0x31, - 0x00, 0x00, 0x00, 0x00, 0x1c, 0xfb, 0xff, 0xff, 0xf4, 0x04, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, - 0x02, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x42, 0xfb, 0xff, 0xff, - 0x00, 0x00, 0x00, 0x01, 0x04, 0xfb, 0xff, 0xff, 0x40, 0xfb, 0xff, 0xff, 0x10, 0x00, 0x00, 0x00, - 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00, - 0x3a, 0x49, 0x64, 0x65, 0x6e, 0x74, 0x69, 0x74, 0x79, 0x3a, 0x31, 0x33, 0x00, 0x00, 0x00, 0x00, - 0x68, 0xfb, 0xff, 0xff, 0xa8, 0x04, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, - 0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x8e, 0xfb, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, - 0x50, 0xfb, 0xff, 0xff, 0x8c, 0xfb, 0xff, 0xff, 0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, - 0x01, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00, 0x3a, 0x49, 0x64, 0x65, - 0x6e, 0x74, 0x69, 0x74, 0x79, 0x3a, 0x31, 0x36, 0x00, 0x00, 0x00, 0x00, 0xb4, 0xfb, 0xff, 0xff, - 0x08, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x56, 0x00, 0x00, 0x00, - 0x02, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0xe2, 0xfb, 0xff, 0xff, - 0x00, 0x00, 0x00, 0x01, 0xa4, 0xfb, 0xff, 0xff, 0xe0, 0xfb, 0xff, 0xff, 0x14, 0x00, 0x00, 0x00, - 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x38, 0x00, 0x00, 0x00, - 0x0a, 0x00, 0x00, 0x00, 0x3a, 0x47, 0x61, 0x74, 0x68, 0x65, 0x72, 0x3a, 0x31, 0x33, 0x00, 0x00, - 0x08, 0xfc, 0xff, 0xff, 0x08, 0x04, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, - 0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x2e, 0xfc, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, - 0xf0, 0xfb, 0xff, 0xff, 0x2c, 0xfc, 0xff, 0xff, 0x04, 0x03, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, - 0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x18, 0xfc, 0xff, 0xff, 0x01, 0x00, 0x00, 0x00, - 0x48, 0xfc, 0xff, 0xff, 0x18, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, - 0x24, 0x00, 0x00, 0x00, 0x38, 0x00, 0x00, 0x00, 0x5c, 0x00, 0x00, 0x00, 0x12, 0x00, 0x00, 0x00, + 0x3e, 0xf9, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, 0x38, 0xf9, 0xff, 0xff, 0x74, 0xf9, 0xff, 0xff, + 0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, + 0x1b, 0x00, 0x00, 0x00, 0x63, 0x6f, 0x6d, 
0x2e, 0x6d, 0x69, 0x63, 0x72, 0x6f, 0x73, 0x6f, 0x66, + 0x74, 0x3a, 0x4e, 0x68, 0x77, 0x63, 0x4d, 0x61, 0x78, 0x50, 0x6f, 0x6f, 0x6c, 0x3a, 0x31, 0x00, + 0xa8, 0xf9, 0xff, 0xff, 0x68, 0x06, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x96, 0xf9, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, + 0x90, 0xf9, 0xff, 0xff, 0xcc, 0xf9, 0xff, 0xff, 0x14, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, 0x44, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, + 0x3a, 0x44, 0x65, 0x71, 0x75, 0x61, 0x6e, 0x74, 0x69, 0x7a, 0x65, 0x4c, 0x69, 0x6e, 0x65, 0x61, + 0x72, 0x3a, 0x32, 0x31, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfa, 0xff, 0xff, 0x98, 0x03, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x50, 0xfa, 0xff, 0xff, 0x02, 0x00, 0x00, 0x00, 0xe8, 0xf9, 0xff, 0xff, 0x24, 0xfa, 0xff, 0xff, + 0xa0, 0x03, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x12, 0xfa, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, 0x7c, 0xfa, 0xff, 0xff, + 0x01, 0x00, 0x00, 0x00, 0x4c, 0xfa, 0xff, 0xff, 0x14, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x48, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x12, 0x00, 0x00, 0x00, 0x3a, 0x51, 0x75, 0x61, 0x6e, 0x74, 0x69, 0x7a, 0x65, 0x4c, 0x69, 0x6e, 0x65, 0x61, 0x72, 0x3a, - 0x31, 0x30, 0x00, 0x00, 0x7c, 0xfc, 0xff, 0xff, 0x30, 0x02, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, - 0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x58, 0xfc, 0xff, 0xff, 0x94, 0xfc, 0xff, 0xff, - 0x44, 0x02, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, - 0x04, 0x00, 0x00, 0x00, 0xba, 0xfc, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, 0x8c, 0xfc, 0xff, 0xff, - 0x02, 0x00, 0x00, 0x00, 0xbc, 0xfc, 0xff, 0xff, 0x4c, 0x01, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, - 0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0xa8, 0xfc, 0xff, 0xff, 0x01, 0x00, 0x00, 0x00, - 0xd8, 0xfc, 0xff, 0xff, 0x14, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, - 0x4c, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x3a, 0x44, 0x65, 0x71, - 0x75, 0x61, 0x6e, 0x74, 0x69, 0x7a, 0x65, 0x4c, 0x69, 0x6e, 0x65, 0x61, 0x72, 0x3a, 0x31, 0x39, - 0x00, 0x00, 0x00, 0x00, 0x0c, 0xfd, 0xff, 0xff, 0xcc, 0x01, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, - 0x02, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x32, 0xfd, 0xff, 0xff, - 0x00, 0x00, 0x00, 0x01, 0x04, 0xfd, 0xff, 0xff, 0x01, 0x00, 0x00, 0x00, 0x34, 0xfd, 0xff, 0xff, - 0x78, 0x01, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, - 0x04, 0x00, 0x00, 0x00, 0x24, 0xfd, 0xff, 0xff, 0x02, 0x00, 0x00, 0x00, 0x1c, 0xfd, 0xff, 0xff, - 0x58, 0xfd, 0xff, 0xff, 0x14, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, - 0x40, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x3a, 0x53, 0x71, 0x75, - 0x65, 0x65, 0x7a, 0x65, 0x3a, 0x31, 0x33, 0x00, 0x80, 0xfd, 0xff, 0xff, 0x08, 0x00, 0x00, 0x00, - 0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x61, 0x78, 0x65, 0x73, 0x00, 0x00, 0x00, 0x00, - 0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x78, 0xfd, 0xff, 0xff, 0x01, 0x00, 0x00, 0x00, - 0xa8, 0xfd, 0xff, 0xff, 0x68, 0x02, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, - 0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0xce, 0xfd, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, - 0x90, 0xfd, 0xff, 0xff, 0xcc, 0xfd, 0xff, 0xff, 0x18, 0x00, 0x00, 
0x00, 0x04, 0x00, 0x00, 0x00, - 0x03, 0x00, 0x00, 0x00, 0x4c, 0x00, 0x00, 0x00, 0x60, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, - 0x12, 0x00, 0x00, 0x00, 0x3a, 0x51, 0x75, 0x61, 0x6e, 0x74, 0x69, 0x7a, 0x65, 0x4c, 0x69, 0x6e, - 0x65, 0x61, 0x72, 0x3a, 0x31, 0x33, 0x00, 0x00, 0x00, 0xfe, 0xff, 0xff, 0x08, 0x00, 0x00, 0x00, - 0x10, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x79, 0x5f, 0x73, 0x63, 0x61, 0x6c, 0x65, 0x00, - 0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0xf8, 0xfd, 0xff, 0xff, 0x01, 0x00, 0x00, 0x00, - 0x28, 0xfe, 0xff, 0xff, 0x84, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, - 0x04, 0x00, 0x00, 0x00, 0x04, 0xfe, 0xff, 0xff, 0x40, 0xfe, 0xff, 0xff, 0x98, 0x00, 0x00, 0x00, + 0x32, 0x31, 0x00, 0x00, 0x7c, 0xfa, 0xff, 0xff, 0x48, 0x03, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x6a, 0xfa, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x01, 0xd4, 0xfa, 0xff, 0xff, 0x02, 0x00, 0x00, 0x00, 0xa4, 0xfa, 0xff, 0xff, + 0xf4, 0x02, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0xf4, 0xfa, 0xff, 0xff, 0x01, 0x00, 0x00, 0x00, 0x8c, 0xfa, 0xff, 0xff, + 0xc8, 0xfa, 0xff, 0xff, 0x14, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x1c, 0x00, 0x00, 0x00, 0x3c, 0x00, 0x00, 0x00, 0x0d, 0x00, 0x00, 0x00, 0x3a, 0x55, 0x6e, 0x73, + 0x71, 0x75, 0x65, 0x65, 0x7a, 0x65, 0x3a, 0x32, 0x31, 0x00, 0x00, 0x00, 0xf4, 0xfa, 0xff, 0xff, + 0x1c, 0x05, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0xe2, 0xfa, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, 0xdc, 0xfa, 0xff, 0xff, + 0x18, 0xfb, 0xff, 0xff, 0x54, 0x01, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x64, 0xfb, 0xff, 0xff, 0x01, 0x00, 0x00, 0x00, 0x34, 0xfb, 0xff, 0xff, + 0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, + 0x0c, 0x00, 0x00, 0x00, 0x3a, 0x49, 0x64, 0x65, 0x6e, 0x74, 0x69, 0x74, 0x79, 0x3a, 0x31, 0x34, + 0x00, 0x00, 0x00, 0x00, 0x5c, 0xfb, 0xff, 0xff, 0xac, 0x03, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x4a, 0xfb, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x01, 0x44, 0xfb, 0xff, 0xff, 0x80, 0xfb, 0xff, 0xff, 0x10, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, + 0x3a, 0x49, 0x64, 0x65, 0x6e, 0x74, 0x69, 0x74, 0x79, 0x3a, 0x31, 0x00, 0xa4, 0xfb, 0xff, 0xff, + 0x6c, 0x04, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x92, 0xfb, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, 0x8c, 0xfb, 0xff, 0xff, + 0xc8, 0xfb, 0xff, 0xff, 0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x18, 0x00, 0x00, 0x00, 0x0d, 0x00, 0x00, 0x00, 0x3a, 0x55, 0x6e, 0x73, 0x71, 0x75, 0x65, 0x65, + 0x7a, 0x65, 0x3a, 0x31, 0x31, 0x00, 0x00, 0x00, 0xf0, 0xfb, 0xff, 0xff, 0x20, 0x04, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, - 0x66, 0xfe, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, 0x38, 0xfe, 0xff, 0xff, 0x02, 0x00, 0x00, 0x00, - 0x68, 0xfe, 0xff, 0xff, 0x14, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, - 0x2c, 0x00, 0x00, 0x00, 0x54, 0x00, 0x00, 0x00, 0x1e, 0x00, 0x00, 0x00, 0x63, 0x6f, 0x6d, 0x2e, - 0x6d, 0x69, 0x63, 0x72, 0x6f, 0x73, 0x6f, 0x66, 0x74, 0x3a, 0x51, 0x75, 0x61, 0x6e, 0x74, 
0x69, - 0x7a, 0x65, 0x4c, 0x69, 0x6e, 0x65, 0x61, 0x72, 0x3a, 0x31, 0x00, 0x00, 0xa4, 0xfe, 0xff, 0xff, - 0x08, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x54, 0x31, 0x00, 0x00, - 0x02, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x9c, 0xfe, 0xff, 0xff, - 0x01, 0x00, 0x00, 0x00, 0x94, 0xfe, 0xff, 0xff, 0xd0, 0xfe, 0xff, 0xff, 0x08, 0x00, 0x00, 0x00, - 0x0c, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x54, 0x32, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, - 0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0xfe, 0xfe, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, - 0xd0, 0xfe, 0xff, 0xff, 0x02, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0x14, 0x00, 0x00, 0x00, - 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, - 0x09, 0x00, 0x00, 0x00, 0x3a, 0x47, 0x61, 0x74, 0x68, 0x65, 0x72, 0x3a, 0x31, 0x00, 0x00, 0x00, - 0x28, 0xff, 0xff, 0xff, 0x08, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, - 0x54, 0x69, 0x6e, 0x64, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, - 0x20, 0xff, 0xff, 0xff, 0x01, 0x00, 0x00, 0x00, 0x50, 0xff, 0xff, 0xff, 0xc0, 0x00, 0x00, 0x00, + 0xde, 0xfb, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, 0xd8, 0xfb, 0xff, 0xff, 0x14, 0xfc, 0xff, 0xff, + 0x14, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, + 0x3c, 0x00, 0x00, 0x00, 0x0d, 0x00, 0x00, 0x00, 0x3a, 0x55, 0x6e, 0x73, 0x71, 0x75, 0x65, 0x65, + 0x7a, 0x65, 0x3a, 0x31, 0x33, 0x00, 0x00, 0x00, 0x40, 0xfc, 0xff, 0xff, 0xd0, 0x03, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, - 0x76, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, 0x38, 0xff, 0xff, 0xff, 0x74, 0xff, 0xff, 0xff, - 0x18, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x84, 0x00, 0x00, 0x00, - 0x24, 0x00, 0x00, 0x00, 0x48, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x3a, 0x44, 0x65, 0x71, - 0x75, 0x61, 0x6e, 0x74, 0x69, 0x7a, 0x65, 0x4c, 0x69, 0x6e, 0x65, 0x61, 0x72, 0x3a, 0x31, 0x33, - 0x00, 0x00, 0x00, 0x00, 0xac, 0xff, 0xff, 0xff, 0x08, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, - 0x07, 0x00, 0x00, 0x00, 0x78, 0x5f, 0x73, 0x63, 0x61, 0x6c, 0x65, 0x00, 0x01, 0x00, 0x00, 0x00, - 0x04, 0x00, 0x00, 0x00, 0xa4, 0xff, 0xff, 0xff, 0x01, 0x00, 0x00, 0x00, 0xd4, 0xff, 0xff, 0xff, - 0x08, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x79, 0x00, 0x00, 0x00, - 0x01, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x06, 0x00, 0x08, 0x00, 0x07, 0x00, - 0x06, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x08, 0x00, 0x0c, 0x00, 0x04, 0x00, 0x08, 0x00, + 0x2e, 0xfc, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, 0x28, 0xfc, 0xff, 0xff, 0x64, 0xfc, 0xff, 0xff, + 0x08, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x61, 0x78, 0x65, 0x73, + 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0xbc, 0xfc, 0xff, 0xff, + 0x01, 0x00, 0x00, 0x00, 0x8c, 0xfc, 0xff, 0xff, 0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00, 0x3a, 0x54, 0x72, 0x61, + 0x6e, 0x73, 0x70, 0x6f, 0x73, 0x65, 0x3a, 0x31, 0x00, 0x00, 0x00, 0x00, 0xb4, 0xfc, 0xff, 0xff, + 0x5c, 0x03, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0xa2, 0xfc, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, 0x9c, 0xfc, 0xff, 0xff, + 0xd8, 0xfc, 0xff, 0xff, 0x28, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, + 0xa8, 0x00, 0x00, 
0x00, 0xd0, 0x00, 0x00, 0x00, 0xfc, 0x00, 0x00, 0x00, 0x28, 0x01, 0x00, 0x00, + 0x2c, 0x00, 0x00, 0x00, 0x50, 0x00, 0x00, 0x00, 0x68, 0x00, 0x00, 0x00, 0x1b, 0x00, 0x00, 0x00, + 0x63, 0x6f, 0x6d, 0x2e, 0x6d, 0x69, 0x63, 0x72, 0x6f, 0x73, 0x6f, 0x66, 0x74, 0x3a, 0x51, 0x4c, + 0x69, 0x6e, 0x65, 0x61, 0x72, 0x43, 0x6f, 0x6e, 0x76, 0x3a, 0x31, 0x00, 0x24, 0xfd, 0xff, 0xff, + 0x08, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x77, 0x5f, 0x73, 0x63, + 0x61, 0x6c, 0x65, 0x00, 0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x7c, 0xfd, 0xff, 0xff, + 0x04, 0x00, 0x00, 0x00, 0x4c, 0xfd, 0xff, 0xff, 0x20, 0x02, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x98, 0xfd, 0xff, 0xff, 0x01, 0x00, 0x00, 0x00, + 0x68, 0xfd, 0xff, 0xff, 0x08, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, + 0x79, 0x5f, 0x73, 0x63, 0x61, 0x6c, 0x65, 0x00, 0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0xc0, 0xfd, 0xff, 0xff, 0x06, 0x00, 0x00, 0x00, 0x90, 0xfd, 0xff, 0xff, 0x08, 0x00, 0x00, 0x00, + 0x0c, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x54, 0x31, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0xe8, 0xfd, 0xff, 0xff, 0x02, 0x00, 0x00, 0x00, + 0x80, 0xfd, 0xff, 0xff, 0xbc, 0xfd, 0xff, 0xff, 0x08, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x54, 0x32, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x14, 0xfe, 0xff, 0xff, 0x05, 0x00, 0x00, 0x00, 0x1c, 0xfe, 0xff, 0xff, + 0x03, 0x00, 0x00, 0x00, 0xec, 0xfd, 0xff, 0xff, 0x08, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x54, 0x33, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0xe2, 0xfd, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, 0x4c, 0xfe, 0xff, 0xff, + 0x07, 0x00, 0x00, 0x00, 0x1c, 0xfe, 0xff, 0xff, 0x08, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x54, 0x34, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x70, 0xfe, 0xff, 0xff, 0x08, 0x00, 0x00, 0x00, 0x40, 0xfe, 0xff, 0xff, 0x10, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x0d, 0x00, 0x00, 0x00, + 0x3a, 0x54, 0x72, 0x61, 0x6e, 0x73, 0x70, 0x6f, 0x73, 0x65, 0x3a, 0x31, 0x33, 0x00, 0x00, 0x00, + 0x68, 0xfe, 0xff, 0xff, 0xa8, 0x01, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x56, 0xfe, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, + 0x50, 0xfe, 0xff, 0xff, 0x8c, 0xfe, 0xff, 0xff, 0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00, 0x3a, 0x49, 0x64, 0x65, + 0x6e, 0x74, 0x69, 0x74, 0x79, 0x3a, 0x31, 0x39, 0x00, 0x00, 0x00, 0x00, 0xb4, 0xfe, 0xff, 0xff, + 0x54, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0xa2, 0xfe, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, 0x9c, 0xfe, 0xff, 0xff, + 0xd8, 0xfe, 0xff, 0xff, 0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x18, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00, 0x3a, 0x49, 0x64, 0x65, 0x6e, 0x74, 0x69, 0x74, + 0x79, 0x3a, 0x32, 0x31, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0x08, 0x00, 0x00, 0x00, + 0x0c, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x56, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0xf6, 0xfe, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, + 0xf0, 0xfe, 0xff, 0xff, 0x2c, 0xff, 0xff, 
0xff, 0x18, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x74, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, 0x48, 0x00, 0x00, 0x00, + 0x14, 0x00, 0x00, 0x00, 0x3a, 0x44, 0x65, 0x71, 0x75, 0x61, 0x6e, 0x74, 0x69, 0x7a, 0x65, 0x4c, + 0x69, 0x6e, 0x65, 0x61, 0x72, 0x3a, 0x31, 0x30, 0x00, 0x00, 0x00, 0x00, 0x64, 0xff, 0xff, 0xff, + 0x08, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x78, 0x5f, 0x73, 0x63, + 0x61, 0x6c, 0x65, 0x00, 0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0xbc, 0xff, 0xff, 0xff, + 0x01, 0x00, 0x00, 0x00, 0x8c, 0xff, 0xff, 0xff, 0x08, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x79, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x7e, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, 0xb0, 0xff, 0xff, 0xff, 0x60, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00, + 0x08, 0x00, 0x08, 0x00, 0x00, 0x00, 0x04, 0x00, 0x08, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0xa0, 0xff, 0xff, 0xff, 0xdc, 0xff, 0xff, 0xff, 0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x3a, 0x53, 0x71, 0x75, + 0x65, 0x65, 0x7a, 0x65, 0x3a, 0x31, 0x31, 0x00, 0x08, 0x00, 0x0c, 0x00, 0x04, 0x00, 0x08, 0x00, 0x08, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x54, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00, - 0x08, 0x00, 0x08, 0x00, 0x00, 0x00, 0x04, 0x00, 0x08, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x06, 0x00, 0x08, 0x00, 0x07, 0x00, 0x06, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x04, 0x00, 0x04, 0x00, 0x04, 0x00, 0x00, 0x00, }; // clang-format on diff --git a/onnxruntime/core/framework/model_metadef_id_generator.cc b/onnxruntime/core/framework/model_metadef_id_generator.cc new file mode 100644 index 000000000000..e51c6ebc2997 --- /dev/null +++ b/onnxruntime/core/framework/model_metadef_id_generator.cc @@ -0,0 +1,75 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. +#include +#include "model_metadef_id_generator.h" +#include "core/platform/ort_mutex.h" +#include "core/graph/graph_viewer.h" +#include "core/framework/murmurhash3.h" + +namespace onnxruntime { +int ModelMetadefIdGenerator::GenerateId(const onnxruntime::GraphViewer& graph_viewer, + HashValue& model_hash) const { + // if the EP is shared across multiple sessions there's a very small potential for concurrency issues. + // use a lock when generating an id to be paranoid + static OrtMutex mutex; + std::lock_guard lock(mutex); + model_hash = 0; + + // find the top level graph + const Graph* cur_graph = &graph_viewer.GetGraph(); + while (cur_graph->IsSubgraph()) { + cur_graph = cur_graph->ParentGraph(); + } + + uint32_t instance_hash[4] = {0, 0, 0, 0}; + + const Graph& main_graph = *cur_graph; + + // hash the bytes in the Graph instance. we can't just use the address as a new Graph instance may use + // the same memory (unit tests prove this can occur). the raw bytes of the Graph instance should be a unique + // fingerprint for the instance that can use used as the key to the hash of the model path/contents. 
+ MurmurHash3::x86_128(&main_graph, gsl::narrow_cast<int32_t>(sizeof(Graph)), instance_hash[0], &instance_hash); + HashValue graph_instance_hash = instance_hash[0] | (uint64_t(instance_hash[1]) << 32); + + // if we've already hashed this main graph instance use the cached value + auto entry = main_graph_hash_.find(graph_instance_hash); + if (entry != main_graph_hash_.cend()) { + model_hash = entry->second; + } else { + uint32_t hash[4] = {0, 0, 0, 0}; + + // prefer the path the model was loaded from + // this may not be available if the model was loaded from a stream or in-memory bytes + const auto& model_path_str = main_graph.ModelPath().ToPathString(); + if (!model_path_str.empty()) { + MurmurHash3::x86_128(model_path_str.data(), gsl::narrow_cast<int32_t>(model_path_str.size()), hash[0], &hash); + } else { + auto hash_str = [&hash](const std::string& str) { + MurmurHash3::x86_128(str.data(), gsl::narrow_cast<int32_t>(str.size()), hash[0], &hash); + }; + + // fingerprint the main graph by hashing graph inputs and the ordered outputs from each node + for (const auto* node_arg : main_graph.GetInputsIncludingInitializers()) { + hash_str(node_arg->Name()); + } + + // note: process nodes in order defined in model to be deterministic + for (const auto& node : main_graph.Nodes()) { + for (const auto* node_arg : node.OutputDefs()) { + if (node_arg->Exists()) { + hash_str(node_arg->Name()); + } + } + } + } + + model_hash = hash[0] | (uint64_t(hash[1]) << 32); + + main_graph_hash_[graph_instance_hash] = model_hash; + } + + // return the current unique id, and increment to update + return model_metadef_id_[model_hash]++; +} + +} // namespace onnxruntime diff --git a/onnxruntime/core/framework/model_metadef_id_generator.h b/onnxruntime/core/framework/model_metadef_id_generator.h new file mode 100644 index 000000000000..82f68c42b5c3 --- /dev/null +++ b/onnxruntime/core/framework/model_metadef_id_generator.h @@ -0,0 +1,31 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once +#include <unordered_map> +#include "core/common/basic_types.h" +namespace onnxruntime { +class GraphViewer; + +/// <summary> +/// helper to generate ids that are unique to the model and deterministic, even if the execution provider is shared across +/// multiple sessions. +/// </summary> +class ModelMetadefIdGenerator { + public: + /** Generate a unique id that can be used in a MetaDef name. Values are unique for a model instance. + The model hash is also returned if you wish to include that in the MetaDef name to ensure uniqueness across models. + @param graph_viewer[in] Graph viewer that GetCapability was called with. Can be for the main graph or nested graph. + @param model_hash[out] Returns the hash for the main (i.e. top level) graph in the model. + This is created using the model path if available, + or the model input names and the output names from all nodes in the main graph.
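+ @return An id that is unique within this model instance, starting at 0 and incrementing on each call
+ for the same model hash. (Added editorially for clarity; follows from model_metadef_id_[model_hash]++ above.)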
+ */ + int GenerateId(const onnxruntime::GraphViewer& graph_viewer, HashValue& model_hash) const; + + private: + // mutable as these are caches so we can minimize the hashing required on each usage of GenerateId + mutable std::unordered_map<HashValue, HashValue> main_graph_hash_; // map graph instance hash to model contents hash + mutable std::unordered_map<HashValue, int> model_metadef_id_; // current unique id for model +}; + +} // namespace onnxruntime diff --git a/onnxruntime/core/framework/node_unit.cc b/onnxruntime/core/framework/node_unit.cc new file mode 100644 index 000000000000..174942b9033d --- /dev/null +++ b/onnxruntime/core/framework/node_unit.cc @@ -0,0 +1,359 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#if !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD) + +#include "node_unit.h" +#include "core/graph/graph_viewer.h" + +namespace onnxruntime { + +namespace { + +enum class QLinearOpType : uint8_t { + Unknown, // Unknown or not a linear quantized op + DequantizeLinear, + QuantizeLinear, + QLinearConv, + QLinearMatMul, + QLinearAdd, + QLinearSigmoid, + QLinearAveragePool, + QLinearMul, + QLinearReduceMean, + QLinearConcat, + QLinearGlobalAveragePool, + QLinearLeakyRelu, +}; + +QLinearOpType GetQLinearOpType(const onnxruntime::Node& node) { + const auto& op_type = node.OpType(); + if (op_type == "DequantizeLinear") + return QLinearOpType::DequantizeLinear; + else if (op_type == "QuantizeLinear") + return QLinearOpType::QuantizeLinear; + else if (op_type == "QLinearConv") + return QLinearOpType::QLinearConv; + else if (op_type == "QLinearMatMul") + return QLinearOpType::QLinearMatMul; + else if (op_type == "QLinearAdd") + return QLinearOpType::QLinearAdd; + else if (op_type == "QLinearSigmoid") + return QLinearOpType::QLinearSigmoid; + else if (op_type == "QLinearAveragePool") + return QLinearOpType::QLinearAveragePool; + else if (op_type == "QLinearMul") + return QLinearOpType::QLinearMul; + else if (op_type == "QLinearReduceMean") + return QLinearOpType::QLinearReduceMean; + else if (op_type == "QLinearConcat") + return QLinearOpType::QLinearConcat; + else if (op_type == "QLinearGlobalAveragePool") + return QLinearOpType::QLinearGlobalAveragePool; + else if (op_type == "QLinearLeakyRelu") + return QLinearOpType::QLinearLeakyRelu; + + return QLinearOpType::Unknown; +} + +// Ops that have 1 input +bool IsUnaryQLinearOp(QLinearOpType type) { + return type == QLinearOpType::QLinearSigmoid || + type == QLinearOpType::QLinearAveragePool || + type == QLinearOpType::QLinearGlobalAveragePool || + type == QLinearOpType::QLinearLeakyRelu || + type == QLinearOpType::QLinearReduceMean; +} + +// Ops that have 2 inputs +bool IsBinaryQLinearOp(QLinearOpType type) { + return type == QLinearOpType::QLinearConv || + type == QLinearOpType::QLinearMatMul || + type == QLinearOpType::QLinearAdd || + type == QLinearOpType::QLinearMul; +} + +// Ops that have 1 or more inputs +bool IsVariadicQLinearOp(QLinearOpType type) { + return type == QLinearOpType::QLinearConcat; +} + +const std::vector<const Node*> GetQDQIONodes(const GraphViewer& graph_viewer, + const QDQ::NodeGroup& node_group, bool is_input) { + std::vector<const Node*> io_nodes; + const auto& src_nodes = is_input ?
node_group.dq_nodes : node_group.q_nodes; + io_nodes.reserve(src_nodes.size()); + for (const auto& node_idx : src_nodes) { + io_nodes.push_back(graph_viewer.GetNode(node_idx)); + } + + return io_nodes; +} + +// Get the input or output NodeUnitIODef(s) for the given QDQ NodeGroup +std::vector<NodeUnitIODef> GetQDQIODefs(const Node& target_node, const QDQ::NodeGroup& node_group, bool is_input) { + const auto& dq_or_q_nodes = is_input ? node_group.dq_nodes : node_group.q_nodes; + const auto target_node_io_defs = is_input ? target_node.InputDefs() : target_node.OutputDefs(); + const size_t target_node_io_defs_size = target_node_io_defs.size(); + + // Find all the quantized IO defs and indices (for the input/output of the target node) + std::unordered_map<size_t, NodeUnitIODef> quantized_io_defs; + quantized_io_defs.reserve(target_node_io_defs_size); + + auto cur = is_input ? target_node.InputEdgesBegin() : target_node.OutputEdgesBegin(); + auto end = is_input ? target_node.InputEdgesEnd() : target_node.OutputEdgesEnd(); + + for (; cur != end; ++cur) { + const Node& node = cur->GetNode(); + + // If we can find the node index in the dq or q nodes this is a quantized input/output + if (std::find(dq_or_q_nodes.cbegin(), dq_or_q_nodes.cend(), node.Index()) != dq_or_q_nodes.cend()) { + const auto node_inputs = node.InputDefs(); + const auto& node_attrs = node.GetAttributes(); + + // Get the Q or DQ axis attribute if available. + std::optional<int64_t> axis; + if (auto entry = node_attrs.find("axis"); entry != node_attrs.end()) { + axis = entry->second.i(); + } + + // quantization scale and zp are always the input[1, 2] + NodeUnitIODef::QuantParam quant_param{*node_inputs[1], node_inputs.size() == 3 ? node_inputs[2] : nullptr, axis}; + + if (is_input) { + // DQ is input to the target node, use the DstArgIndex + auto idx = cur->GetDstArgIndex(); + // This is a DQ node, we are using x, x_scale, x_zp (input[0, 1, 2]) + quantized_io_defs.insert({idx, NodeUnitIODef{*node_inputs[0], quant_param}}); + } else { + // Q is output of the target node, use the SrcArgIndex + auto idx = cur->GetSrcArgIndex(); + // This is a Q node, we are using y (output[0]), y_scale, y_zp (input[1, 2]) + const auto node_outputs = node.OutputDefs(); + quantized_io_defs.insert({idx, NodeUnitIODef{*node_outputs[0], quant_param}}); + } + } + } + + // Construct the IODefs for this QDQ NodeGroup + std::vector<NodeUnitIODef> io_defs; + io_defs.reserve(target_node_io_defs_size); + for (size_t i = 0; i < target_node_io_defs_size; i++) { + // If we can find the NodeUnitIODef for this index, this is a quantized input/output + if (quantized_io_defs.find(i) != quantized_io_defs.cend()) { + io_defs.push_back(std::move(quantized_io_defs.at(i))); + } else { + // This is a regular input + io_defs.push_back({*target_node_io_defs[i], std::nullopt}); + } + } + + return io_defs; +} + +} // namespace + +Status QDQ::NodeGroup::CanCreateNodeGroup(const GraphViewer& graph_viewer, + const Node& target_node, + gsl::span<const Node* const> dq_nodes, + gsl::span<const Node* const> q_nodes) { + // Within a QDQ node group, a target node input is the only consumer of each DQ. + // This should have been ensured by the EnsureUniqueDQForNodeUnit graph transformer, but other graph modifications + // may have happened since. Verify that this is still true. + for (const auto* dq_node : dq_nodes) { + const bool dq_produces_graph_output = graph_viewer.NodeProducesGraphOutput(*dq_node); + ORT_RETURN_IF(dq_produces_graph_output, + "QDQ node group cannot have DQ node that produces a graph output.
DQ node: ", dq_node->Name(), + ", target node: ", target_node.Name()); + + const bool dq_has_single_output_edge_to_target = + dq_node->GetOutputEdgesCount() == 1 && + dq_node->OutputEdgesBegin()->GetNode().Index() == target_node.Index(); + ORT_RETURN_IF_NOT(dq_has_single_output_edge_to_target, + "QDQ node group cannot have DQ that doesn't have a single output edge to the target node. " + "DQ node: ", + dq_node->Name(), ", target node: ", target_node.Name()); + } + + // an output from the target node can have either Q consumers or direct consumers. it cannot have both. + // this must be checked on a per output basis. + // e.g. TopK produces values and indices. The indices output won't be quantized, so even if we replace the TopK QDQ + // node group with a quantized TopK, an int64_t indices value will be produced and can provide a graph output. + if (!q_nodes.empty()) { + auto cur_edge = target_node.OutputEdgesBegin(); + auto end_edge = target_node.OutputEdgesEnd(); + std::vector output_consumers(target_node.OutputDefs().size(), nullptr); + + for (; cur_edge != end_edge; ++cur_edge) { + auto output_idx = cur_edge->GetSrcArgIndex(); + const Node& this_consumer = cur_edge->GetNode(); + const Node* existing_consumer = output_consumers[output_idx]; + + if (existing_consumer != nullptr) { + // another edge for this output. either both are Q or both are not. + bool valid = true; + if (existing_consumer->OpType() == "QuantizeLinear") { + valid = this_consumer.OpType() == "QuantizeLinear"; + } else { + valid = this_consumer.OpType() != "QuantizeLinear"; + } + + ORT_RETURN_IF_NOT(valid, + "QDQ node group cannot have an output from the target node being consumed by a Q node and " + "a non-Q node. target node: ", + target_node.Name()); + } else { + output_consumers[output_idx] = &this_consumer; + } + } + + const auto& graph_outputs = graph_viewer.GetOutputs(); + for (size_t idx = 0, end = output_consumers.size(); idx < end; ++idx) { + // any output with a Q cannot be a graph output as it will disappear if the QDQ node unit is converted to + // a quantized op. + if (output_consumers[idx] != nullptr && output_consumers[idx]->OpType() == "QuantizeLinear") { + const auto& output_name = target_node.OutputDefs()[idx]->Name(); + bool is_graph_output = std::any_of(graph_outputs.begin(), graph_outputs.end(), + [&output_name](const NodeArg* node_arg) { + return node_arg->Name() == output_name; + }); + ORT_RETURN_IF(is_graph_output, + "QDQ node group cannot have an output from the target node that is consumed by a Q node and " + "a graph output. 
target node: ", + target_node.Name(), " output idx:", idx); + } + } + } + + return Status::OK(); +} +NodeUnit::NodeUnit(const Node& node) + : target_node_(node), + type_(Type::SingleNode), + input_edge_count_(node.GetInputEdgesCount()) { + InitForSingleNode(); +} + +NodeUnit::NodeUnit(const GraphViewer& graph_viewer, const QDQ::NodeGroup& node_group) + : dq_nodes_{GetQDQIONodes(graph_viewer, node_group, true /* is_input */)}, + target_node_(*graph_viewer.GetNode(node_group.target_node)), + q_nodes_{GetQDQIONodes(graph_viewer, node_group, false /* is_input */)}, + type_(Type::QDQGroup), + inputs_{GetQDQIODefs(target_node_, node_group, true /* is_input */)}, + outputs_{GetQDQIODefs(target_node_, node_group, false /* is_input */)} { + ORT_THROW_IF_ERROR(QDQ::NodeGroup::CanCreateNodeGroup(graph_viewer, target_node_, dq_nodes_, q_nodes_)); + + input_edge_count_ = std::accumulate(dq_nodes_.cbegin(), dq_nodes_.cend(), size_t(0), + [](size_t acc, const Node* node) { return acc + node->GetInputEdgesCount(); }); + + // add edges for inputs that are not from DQ nodes. there is one edge to each DQ node. + // other inputs could come from initializers or graph inputs (no edges) or other nodes (edge). + input_edge_count_ += target_node_.GetInputEdgesCount() - dq_nodes_.size(); + + // create output edges. each target node output either goes to Q node/s or non-Q node/s. + // ValidateNodeGroupQDQNodes ensures this. + auto cur_edge = target_node_.OutputEdgesBegin(); + auto end_edge = target_node_.OutputEdgesEnd(); + for (; cur_edge != end_edge; ++cur_edge) { + const Node& node = cur_edge->GetNode(); + + // if node is in q_nodes we hide the Q node. + if (std::find(q_nodes_.cbegin(), q_nodes_.cend(), &node) != q_nodes_.cend()) { + auto src_idx = cur_edge->GetSrcArgIndex(); + auto q_cur_edge = node.OutputEdgesBegin(); + auto q_end_edge = node.OutputEdgesEnd(); + for (; q_cur_edge != q_end_edge; ++q_cur_edge) { + output_edges_.insert(Node::EdgeEnd{q_cur_edge->GetNode(), src_idx, q_cur_edge->GetDstArgIndex()}); + } + } else { + // non-Q node, or Q node that isn't in the QDQ node group (unexpected but may be possible). add as-is. 
+ output_edges_.insert(*cur_edge); + } + } +} + +const std::string& NodeUnit::Domain() const noexcept { return target_node_.Domain(); } +const std::string& NodeUnit::OpType() const noexcept { return target_node_.OpType(); } +const std::string& NodeUnit::Name() const noexcept { return target_node_.Name(); } +int NodeUnit::SinceVersion() const noexcept { return target_node_.SinceVersion(); } +NodeIndex NodeUnit::Index() const noexcept { return target_node_.Index(); } +const Path& NodeUnit::ModelPath() const noexcept { return target_node_.ModelPath(); } +ProviderType NodeUnit::GetExecutionProviderType() const noexcept { return target_node_.GetExecutionProviderType(); } + +void NodeUnit::InitForSingleNode() { + const auto& input_defs = target_node_.InputDefs(); + const auto& output_defs = target_node_.OutputDefs(); + auto qlinear_type = GetQLinearOpType(target_node_); + if (qlinear_type == QLinearOpType::Unknown || IsVariadicQLinearOp(qlinear_type)) { // TODO, add variadic support + // Not a Qlinear op, add all inputs / outputs + auto add_all_io = [](std::vector<NodeUnitIODef>& defs, + const ConstPointerContainer<std::vector<NodeArg*>>& node_defs) { + defs.reserve(node_defs.size()); + + for (const auto def : node_defs) { + defs.push_back(NodeUnitIODef{*def, std::nullopt}); + } + }; + + add_all_io(inputs_, input_defs); + add_all_io(outputs_, output_defs); + } else if (IsUnaryQLinearOp(qlinear_type)) { + // Unary QLinear Op has 5 inputs + // x, x_scale, x_zp, y_scale, y_zp (optional) + inputs_.push_back(NodeUnitIODef{*input_defs[0], NodeUnitIODef::QuantParam{*input_defs[1], input_defs[2]}}); + outputs_.push_back(NodeUnitIODef{*output_defs[0], + NodeUnitIODef::QuantParam{*input_defs[3], + input_defs.size() > 4 ? input_defs[4] : nullptr}}); + + } else if (IsBinaryQLinearOp(qlinear_type)) { + // Binary QLinear Op has 9 inputs + // x1, x1_scale, x1_zp, x2/w, x2_scale, x2_zp, y_scale, y_zp, B + inputs_.push_back(NodeUnitIODef{*input_defs[0], NodeUnitIODef::QuantParam{*input_defs[1], input_defs[2]}}); + inputs_.push_back(NodeUnitIODef{*input_defs[3], NodeUnitIODef::QuantParam{*input_defs[4], input_defs[5]}}); + + if (input_defs.size() == 9) { // has Bias + inputs_.push_back(NodeUnitIODef{*input_defs[8], std::nullopt}); // for Bias the scale and zp are optional + } + + outputs_.push_back(NodeUnitIODef{*output_defs[0], NodeUnitIODef::QuantParam{*input_defs[6], input_defs[7]}}); + + } else if (qlinear_type == QLinearOpType::DequantizeLinear) { + // DequantizeLinear has 3 inputs + // x, x_scale, x_zp + // output is not quantized + inputs_.push_back(NodeUnitIODef{*input_defs[0], NodeUnitIODef::QuantParam{*input_defs[1], input_defs.size() == 3 + ? input_defs[2] + : nullptr}}); + outputs_.push_back(NodeUnitIODef{*output_defs[0], std::nullopt}); + + } else if (qlinear_type == QLinearOpType::QuantizeLinear) { + // QuantizeLinear the input is not quantized and has 3 inputs + // x, y_scale, y_zp (optional) + // The output is quantized + inputs_.push_back(NodeUnitIODef{*input_defs[0], std::nullopt}); + outputs_.push_back(NodeUnitIODef{*output_defs[0], NodeUnitIODef::QuantParam{*input_defs[1], input_defs.size() == 3 + ? input_defs[2] + : nullptr}}); + } else { + ORT_THROW("The QLinear op [", static_cast<int>(qlinear_type), "] is not supported"); + } +} + +Node::EdgeConstIterator NodeUnit::OutputEdgesBegin() const { + return (type_ == Type::SingleNode) ? target_node_.OutputEdgesBegin() : output_edges_.begin(); +} + +Node::EdgeConstIterator NodeUnit::OutputEdgesEnd() const { + return (type_ == Type::SingleNode) ? 
target_node_.OutputEdgesEnd() : output_edges_.end(); +} + +std::vector<const Node*> NodeUnit::GetAllNodesInGroup() const noexcept { + std::vector<const Node*> all_nodes = dq_nodes_; + all_nodes.push_back(&target_node_); + all_nodes.insert(all_nodes.end(), q_nodes_.begin(), q_nodes_.end()); + return all_nodes; +} + +} // namespace onnxruntime + +#endif // !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD) diff --git a/onnxruntime/core/providers/shared/node_unit/node_unit.h b/onnxruntime/core/framework/node_unit.h similarity index 51% rename from onnxruntime/core/providers/shared/node_unit/node_unit.h rename to onnxruntime/core/framework/node_unit.h index b47204ca3c42..a168495f12eb 100644 --- a/onnxruntime/core/providers/shared/node_unit/node_unit.h +++ b/onnxruntime/core/framework/node_unit.h @@ -3,6 +3,9 @@ #pragma once +// QDQ models require graph modification at runtime, so we know this infrastructure is not used in a minimal build +#if !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD) + #include <optional> #include <string> #include <vector> @@ -18,17 +21,31 @@ class NodeArg; class Path; namespace QDQ { -struct NodeGroup; -} +// Struct to represent a DequantizeLinear -> Op -> QuantizeLinear node group +struct NodeGroup { + std::vector<NodeIndex> dq_nodes; + std::vector<NodeIndex> q_nodes; + NodeIndex target_node; + + // Validator to check if the set of nodes can form a valid QDQ NodeGroup. + // Checks target node is only consumer of each DQ, and that the outputs remain valid if the QDQ node group was to + // be converted into a single node with a quantized operator. + static Status CanCreateNodeGroup(const GraphViewer& graph_viewer, + const Node& target_node, + gsl::span<const Node* const> dq_nodes, + gsl::span<const Node* const> q_nodes); +}; +} // namespace QDQ // Definition of one input or output // If the optional quant_param is present, then this is a quantized input, // otherwise this is a regular input struct NodeUnitIODef { - // The quantization parameter, scale is manadatory, and zero_point is optional + // The quantization parameter. Scale is mandatory. Zero-point and axis are optional. struct QuantParam { const NodeArg& scale; const NodeArg* zero_point{nullptr}; + std::optional<int64_t> axis{std::nullopt}; }; const NodeArg& node_arg; @@ -69,26 +86,33 @@ class NodeUnit { const std::vector<const Node*>& GetQNodes() const noexcept { return q_nodes_; } std::vector<const Node*> GetAllNodesInGroup() const noexcept; - Node::EdgeConstIterator OutputEdgesBegin(size_t index) const; - Node::EdgeConstIterator OutputEdgesEnd(size_t index) const; + /// Number of input edges to the logical node. For a QDQ node this is the count of input edges to the DQ nodes + /// plus any other edges to the target node for inputs that are not via a DQ node. + size_t InputEdgeCount() const { return input_edge_count_; } + + // output edges. src index is for outputs of the target node. dest index and node is for consumer of node unit + // output. any Q nodes are hidden. + Node::EdgeConstIterator OutputEdgesBegin() const; + Node::EdgeConstIterator OutputEdgesEnd() const; private: - const std::vector<const Node*> q_nodes_; // q-nodes for this NodeUnit - const std::vector<const Node*> dq_nodes_; // dq nodes for this NodeUnit, not all inputs + // Initialization for a NodeUnit that contains a single node + void InitForSingleNode(); + + const std::vector<const Node*> dq_nodes_; // dq nodes for this NodeUnit, not necessarily all inputs const Node& target_node_; + const std::vector<const Node*> q_nodes_; // q-nodes for this NodeUnit. 
not necessarily all outputs const Type type_; std::vector<NodeUnitIODef> inputs_; std::vector<NodeUnitIODef> outputs_; - // Initializing for a single Node - void InitForSingleNode(); -}; + size_t input_edge_count_; // total number of input edges -// Get all the nodes in the given graph_viewer as NodeUnits (SingleNode or QDQGroup) -// And return a map to quick query the NodeUnit which contains the given Node, -// Note, the value of the map is owned by the vector of std::unique_ptr -std::pair<std::vector<std::unique_ptr<NodeUnit>>, std::unordered_map<const Node*, const NodeUnit*>> -GetAllNodeUnits(const GraphViewer& graph_viewer); + // output edges, hiding any Q nodes involved. src_idx will be value from target node. only used for QDQ node group. + Node::EdgeSet output_edges_; +}; } // namespace onnxruntime + +#endif // !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD) diff --git a/onnxruntime/core/framework/op_kernel_info.cc b/onnxruntime/core/framework/op_kernel_info.cc index 841fdb585f0d..28793dae36d2 100644 --- a/onnxruntime/core/framework/op_kernel_info.cc +++ b/onnxruntime/core/framework/op_kernel_info.cc @@ -15,7 +15,8 @@ OpKernelInfo::OpKernelInfo(const onnxruntime::Node& node, const std::unordered_map<int, OrtValue>& constant_initialized_tensors, const OrtValueNameIdxMap& ort_value_name_idx_map, const DataTransferManager& data_transfer_mgr, - const AllocatorMap& allocators) + const AllocatorMap& allocators, + const ConfigOptions& config_options) : OpNodeProtoHelper(&proto_helper_context_), node_(node), kernel_def_(kernel_def), @@ -24,15 +25,22 @@ OpKernelInfo::OpKernelInfo(const onnxruntime::Node& node, ort_value_name_idx_map_(ort_value_name_idx_map), data_transfer_mgr_(data_transfer_mgr), proto_helper_context_(node), - allocators_(allocators) {} + allocators_(allocators), + config_options_(config_options) { +} OpKernelInfo::OpKernelInfo(const OpKernelInfo& other) : OpKernelInfo(other.node_, other.kernel_def_, *other.execution_provider_, other.constant_initialized_tensors_, - other.ort_value_name_idx_map_, other.data_transfer_mgr_, other.allocators_) {} + other.ort_value_name_idx_map_, other.data_transfer_mgr_, + other.allocators_, other.config_options_) { +} AllocatorPtr OpKernelInfo::GetAllocator(OrtMemType mem_type) const { auto it = allocators_.find(execution_provider_->GetOrtDeviceByMemType(mem_type)); - if (it != allocators_.end()) return it->second; + if (it != allocators_.end()) { + return it->second; + } + return nullptr; } diff --git a/onnxruntime/core/framework/sequential_execution_plan.h b/onnxruntime/core/framework/sequential_execution_plan.h index 3152154e52d7..62c66bc6f336 100644 --- a/onnxruntime/core/framework/sequential_execution_plan.h +++ b/onnxruntime/core/framework/sequential_execution_plan.h @@ -203,6 +203,8 @@ struct SequentialExecutionPlan : public ExecutionPlanBase { } return count; } + + InlinedVector<size_t> node_stream_map_; }; // Output details of an execution plan: diff --git a/onnxruntime/core/framework/sequential_executor.cc b/onnxruntime/core/framework/sequential_executor.cc index ba68bc1d7d83..0cc7294a4649 100644 --- a/onnxruntime/core/framework/sequential_executor.cc +++ b/onnxruntime/core/framework/sequential_executor.cc @@ -181,7 +181,7 @@ class SessionScope { } auto& logger = session_state_.Logger(); - LOGS(logger, VERBOSE) << "Begin execution"; + VLOGS(logger, 0) << "Begin execution"; const SequentialExecutionPlan& seq_exec_plan = *session_state_.GetExecutionPlan(); const auto& exec_plan_vec = seq_exec_plan.execution_plan; VLOGS(logger, 1) << "Size of execution plan vector: " << exec_plan_vec.size(); @@ -306,18 +306,20 @@ class KernelScope { 
#endif #ifdef ENABLE_NVTX_PROFILE - auto& node = kernel_.Node(); - profile::NvtxRangeCreator& forward_range = session_scope_.forward_range_; - profile::NvtxRangeCreator& backward_range = session_scope_.backward_range_; - if (node.Description() != "Backward pass" && !forward_range.IsBeginCalled()) { - // Start timing forward pass when encountering the first forward node. - forward_range.Begin(); - } else if (node.Description() == "Backward pass" && !backward_range.IsBeginCalled() && - forward_range.IsBeginCalled()) { - // Start timing backward pass when encountering the first backward node. - // In the meanwhile, forward range ends. - forward_range.End(); - backward_range.Begin(); + { + auto& node = kernel_.Node(); + profile::NvtxRangeCreator& forward_range = session_scope_.forward_range_; + profile::NvtxRangeCreator& backward_range = session_scope_.backward_range_; + if (node.Description() != "Backward pass" && !forward_range.IsBeginCalled()) { + // Start timing forward pass when encountering the first forward node. + forward_range.Begin(); + } else if (node.Description() == "Backward pass" && !backward_range.IsBeginCalled() && + forward_range.IsBeginCalled()) { + // Start timing backward pass when encountering the first backward node. + // In the meanwhile, forward range ends. + forward_range.End(); + backward_range.Begin(); + } } #endif @@ -515,7 +517,7 @@ onnxruntime::Status ExecuteKernel(StreamExecutionContext& ctx, return Status(status.Category(), status.Code(), msg_string); } ctx.RecycleNodeInputs(idx); - LOGS(logger, VERBOSE) << "stream " << stream_idx << " launch kernel with idx " << idx; + VLOGS(logger, 0) << "stream " << stream_idx << " launch kernel with idx " << idx; return Status::OK(); } @@ -531,7 +533,7 @@ onnxruntime::Status ExecuteThePlan(const SessionState& session_state, gsl::span< const bool only_execute_path_to_fetches, bool single_thread_mode) { auto* execution_plan = session_state.GetExecutionPlan(); - LOGS(logger, VERBOSE) << "Number of streams: " << execution_plan->execution_plan.size(); + VLOGS(logger, 0) << "Number of streams: " << execution_plan->execution_plan.size(); int32_t valid_streams = 0; for (auto& stream : execution_plan->execution_plan) { if (stream && stream->steps_.size() > 0) diff --git a/onnxruntime/core/framework/session_options.h b/onnxruntime/core/framework/session_options.h index 40c59cfcf699..796a018ac0f6 100644 --- a/onnxruntime/core/framework/session_options.h +++ b/onnxruntime/core/framework/session_options.h @@ -65,6 +65,11 @@ struct FreeDimensionOverride { * Configuration information for a session. */ struct SessionOptions { +#if defined(__wasm__) && defined(__EMSCRIPTEN_PTHREADS__) + static constexpr bool DEFAULT_USE_PER_SESSION_THREADS = false; +#else + static constexpr bool DEFAULT_USE_PER_SESSION_THREADS = true; +#endif ExecutionMode execution_mode = ExecutionMode::ORT_SEQUENTIAL; // set the execution order of the graph @@ -129,7 +134,8 @@ struct SessionOptions { // By default the session uses its own set of threadpools, unless this is set to false. // Use this in conjunction with the CreateEnvWithGlobalThreadPools API. - bool use_per_session_threads = true; + bool use_per_session_threads = DEFAULT_USE_PER_SESSION_THREADS; + bool thread_pool_allow_spinning = true; // Deterministic compute is likely not as performant. This option is default to false. 
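The new compile-time default above is worth a small illustration. Here is a minimal, self-contained C++ sketch (not part of the diff; SessionOptionsSketch and main are hypothetical stand-ins for onnxruntime::SessionOptions and its usage) of how a constant like DEFAULT_USE_PER_SESSION_THREADS keeps per-session thread pools enabled everywhere except multi-threaded WebAssembly builds, where sessions presumably need to share the environment's global thread pools (cf. the CreateEnvWithGlobalThreadPools API mentioned in the comment above):

#include <iostream>

struct SessionOptionsSketch {  // hypothetical stand-in for onnxruntime::SessionOptions
#if defined(__wasm__) && defined(__EMSCRIPTEN_PTHREADS__)
  static constexpr bool DEFAULT_USE_PER_SESSION_THREADS = false;  // multi-threaded wasm: share global pools
#else
  static constexpr bool DEFAULT_USE_PER_SESSION_THREADS = true;   // everywhere else: per-session pools
#endif
  bool use_per_session_threads = DEFAULT_USE_PER_SESSION_THREADS;
};

int main() {
  SessionOptionsSketch so;
  std::cout << (so.use_per_session_threads ? "per-session thread pools\n" : "global thread pools\n");
  return 0;
}

The design choice is that the member keeps a single initializer while the platform policy lives in one #if block, so callers can still override the value at runtime.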
diff --git a/onnxruntime/core/framework/session_state.h b/onnxruntime/core/framework/session_state.h index 51bb02918d82..e318c9a8238c 100644 --- a/onnxruntime/core/framework/session_state.h +++ b/onnxruntime/core/framework/session_state.h @@ -8,7 +8,7 @@ #include #include -#include "flatbuffers/flatbuffers.h" +#include "core/common/flatbuffers.h" #include "core/common/gsl.h" @@ -259,8 +259,8 @@ class SessionState { * \param p_node0 Nullable * \param kci0 Nullable */ - NodeInfo(size_t index0, const onnxruntime::Node* p_node0, const KernelCreateInfo* kci0, const OrtDevice& device0) - : index(index0), p_node(p_node0), kci(kci0), device(&device0) {} + NodeInfo(size_t index0, const onnxruntime::Node* p_node0, const KernelCreateInfo* kci0, const OrtDevice& device0, int stream_index0 = -1) + : index(index0), p_node(p_node0), kci(kci0), device(&device0), stream_index(stream_index0) {} size_t index; // Nullable @@ -268,6 +268,7 @@ class SessionState { // Nullable const KernelCreateInfo* kci = nullptr; const OrtDevice* device = nullptr; + int stream_index; }; using NameNodeInfoMapType = InlinedHashMap<std::string, InlinedVector<NodeInfo>>; diff --git a/onnxruntime/core/framework/session_state_utils.cc b/onnxruntime/core/framework/session_state_utils.cc index df11fe8302ae..692ca0877253 100644 --- a/onnxruntime/core/framework/session_state_utils.cc +++ b/onnxruntime/core/framework/session_state_utils.cc @@ -367,6 +367,7 @@ common::Status SaveInputOutputNamesToNodeMapping(const onnxruntime::GraphViewer& for (auto& node : graph.Nodes()) { const KernelCreateInfo& kci = session_state.GetNodeKernelCreateInfo(node.Index()); + int stream_index = static_cast<int>(exec_plan->node_stream_map_[node.Index()]); ORT_RETURN_IF_ERROR( onnxruntime::Node::ForEachWithIndex( @@ -379,8 +380,7 @@ common::Status SaveInputOutputNamesToNodeMapping(const onnxruntime::GraphViewer& int arg_index; ORT_RETURN_IF_ERROR(name_to_id.GetIdx(arg.Name(), arg_index)); const auto& device = exec_plan->GetLocation(arg_index); - - SessionState::NodeInfo node_info(index, &node, &kci, device); + SessionState::NodeInfo node_info(index, &node, &kci, device, stream_index); if (IsArgNameInInputsOutputs(arg.Name(), graph_inputs)) { ORT_RETURN_IF_ERROR(session_state.AddInputNameToNodeInfoMapping(arg.Name(), node_info)); @@ -419,7 +419,7 @@ common::Status SaveInputOutputNamesToNodeMapping(const onnxruntime::GraphViewer& int arg_index; ORT_RETURN_IF_ERROR(name_to_id.GetIdx(input_def->Name(), arg_index)); auto& device = exec_plan->GetLocation(arg_index); - SessionState::NodeInfo node_info(std::numeric_limits<size_t>::max(), &node, &kci, device); + SessionState::NodeInfo node_info(std::numeric_limits<size_t>::max(), &node, &kci, device, stream_index); ORT_RETURN_IF_ERROR(session_state.AddInputNameToNodeInfoMapping(input_def->Name(), node_info)); } } diff --git a/onnxruntime/core/framework/stream_execution_context.cc b/onnxruntime/core/framework/stream_execution_context.cc index 4ff5ee5db865..dd7f4d35b34b 100644 --- a/onnxruntime/core/framework/stream_execution_context.cc +++ b/onnxruntime/core/framework/stream_execution_context.cc @@ -168,7 +168,7 @@ void StreamExecutionContext::RecycleNodeInputs(onnxruntime::NodeIndex node_index for (auto idx : execution_plan->node_release_list[node_index]) { if (--release_plan_[idx] == 0) { ORT_ENFORCE(frame_.ReleaseMLValue(static_cast<int>(execution_plan->release_actions[idx].value_index)).IsOK()); - LOGS(*logger_, VERBOSE) << "ort value " << execution_plan->release_actions[idx].value_index << " released"; + VLOGS(*logger_, 0) << "ort value " << 
execution_plan->release_actions[idx].value_index << " released"; } } } @@ -181,11 +181,13 @@ void RunSince(size_t stream_idx, StreamExecutionContext& ctx, SessionScope& sess } #ifdef USE_CANN + // Leave it to CANN EP to fill the gap if they want to use run_options + static onnxruntime::RunOptions run_options; // For CANN EP, it is necessary to explicitly create a corresponding Context for each thread in the thread pool, // which is different from CUDA Runtime API, but similar to CUDA Driver API. auto& execution_providers = ctx.GetSessionState().GetExecutionProviders(); for (auto& xp : execution_providers) { - auto status = xp->OnRunStart(); + auto status = xp->OnRunStart(run_options); if (!status.IsOK()) { ctx.SetStatus(status); return; diff --git a/onnxruntime/core/framework/tensorprotoutils.cc b/onnxruntime/core/framework/tensorprotoutils.cc index fd32aaedcc2e..8a2db6d5728a 100644 --- a/onnxruntime/core/framework/tensorprotoutils.cc +++ b/onnxruntime/core/framework/tensorprotoutils.cc @@ -7,6 +7,10 @@ #include #include +#if defined(__wasm__) +#include <emscripten.h> +#endif + #include "core/common/gsl.h" #include "core/common/logging/logging.h" #include "core/common/narrow.h" @@ -769,6 +773,7 @@ static void DeleteCharArray(void* param) noexcept { delete[] arr; } +#if !defined(__wasm__) static Status GetFileContent( const Env& env, const ORTCHAR_T* file_path, FileOffsetType offset, size_t length, void*& raw_buffer, OrtCallback& deleter) { @@ -797,6 +802,7 @@ static Status GetFileContent( raw_buffer = buffer.release(); return Status::OK(); } +#endif Status GetExtDataFromTensorProto(const Env& env, const ORTCHAR_T* model_path, const ONNX_NAMESPACE::TensorProto& tensor_proto, @@ -819,6 +825,69 @@ Status GetExtDataFromTensorProto(const Env& env, const ORTCHAR_T* model_path, ext_data_len = raw_data_safe_len; ext_data_deleter = OrtCallback{nullptr, nullptr}; } else { +#if defined(__wasm__) + ORT_RETURN_IF(file_offset < 0 || file_offset + raw_data_safe_len >= 4294967296, + "External initializer: ", tensor_proto.name(), + " offset: ", file_offset, " size to read: ", static_cast<size_t>(raw_data_safe_len), + " are out of bounds or can not be read in full (>4GB)."); + + auto buffer = std::make_unique<char[]>(raw_data_safe_len); + ext_data_deleter = OrtCallback{DeleteCharArray, buffer.get()}; + ext_data_buf = buffer.release(); + ext_data_len = raw_data_safe_len; + + // In WebAssembly, try to use a simplified preloaded file map when available. + auto err_code = EM_ASM_INT(({ + // If available, "Module.MountedFiles" is a Map for all preloaded files. + if (typeof Module == 'undefined' || !Module.MountedFiles) { + return 1; // "Module.MountedFiles" is not available. + } + let fileName = UTF8ToString($0 >>> 0); + if (fileName.startsWith('./')) { + fileName = fileName.substring(2); + } + const fileData = Module.MountedFiles.get(fileName); + if (!fileData) { + return 2; // File not found in preloaded files. + } + const offset = $1 >>> 0; + const length = $2 >>> 0; + const buffer = $3 >>> 0; + + if (offset + length > fileData.byteLength) { + return 3; // Out of bounds. + } + + try { + // Copy the file data (fileData,offset,length) into WebAssembly memory (HEAPU8,buffer,length). 
+ HEAPU8.set(fileData.subarray(offset, offset + length), buffer); + return 0; + } catch { + return 4; + } + }), + external_data_file_path.c_str(), + static_cast<int32_t>(file_offset), + static_cast<int32_t>(raw_data_safe_len), + ext_data_buf); + const char* err_msg; + switch (err_code) { + case 0: + return Status::OK(); + case 1: + err_msg = "Module.MountedFiles is not available."; + break; + case 2: + err_msg = "File not found in preloaded files."; + break; + case 3: + err_msg = "Out of bounds."; + break; + default: + err_msg = "Unknown error occurred in memory copy."; + } + return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Failed to load external data file \"", external_data_file_path, "\", error: ", err_msg); +#else size_t file_length; // error reporting is inconsistent across platforms. Make sure the full path we attempted to open is included. auto status = env.GetFileLength(external_data_file_path.c_str(), file_length); @@ -836,6 +905,7 @@ Status GetExtDataFromTensorProto(const Env& env, const ORTCHAR_T* model_path, ORT_RETURN_IF_ERROR(GetFileContent(env, external_data_file_path.c_str(), file_offset, raw_data_safe_len, ext_data_buf, ext_data_deleter)); ext_data_len = raw_data_safe_len; +#endif } return Status::OK(); diff --git a/onnxruntime/core/framework/utils.cc b/onnxruntime/core/framework/utils.cc index 23fe5e1cd3d9..0c4d498fae9e 100644 --- a/onnxruntime/core/framework/utils.cc +++ b/onnxruntime/core/framework/utils.cc @@ -270,6 +270,15 @@ static common::Status CalculateStaticCopyInfoForFeed(const SessionState& session } copy_info.target_device = *node_info.device; + copy_info.unique_stream_index_consumes_it = node_info.stream_index; + ORT_RETURN_IF(node_info.stream_index < 0, "node_info.stream_index < 0"); + for (size_t i = 1; i < node_info_vec.size(); i++) { + ORT_RETURN_IF(node_info_vec[i].stream_index < 0, "node_info_vec[i].stream_index < 0"); + if (node_info_vec[i].stream_index != node_info.stream_index) { + copy_info.unique_stream_index_consumes_it = -1; + break; + } + } #ifdef ENABLE_TRAINING } else { @@ -441,11 +450,12 @@ static void FinalizeFeedFetchCopyInfo(FeedsFetchesManager& feeds_fetches_manager static common::Status CopyInputsAcrossDevices(const SessionState& session_state, gsl::span<const OrtValue> orig_feeds, std::vector<OrtValue>& new_feeds, - gsl::span<const MLValueCopyInfo> copy_info, - gsl::span<Stream* const> feed_streams) { +#ifdef ORT_ENABLE_STREAM + DeviceStreamCollection* device_stream_collection, +#endif + gsl::span<const MLValueCopyInfo> copy_info) { size_t num_feeds = orig_feeds.size(); ORT_ENFORCE(copy_info.size() == num_feeds); - ORT_ENFORCE(feed_streams.size() == num_feeds); new_feeds.resize(num_feeds); std::vector<IDataTransfer::SrcDstPair> batched_data_transfers; @@ -453,14 +463,32 @@ static common::Status CopyInputsAcrossDevices(const SessionState& session_state, std::vector<IDataTransfer::SparseSrcDstPair> batched_sparse_data_transfers; #endif + std::unordered_set<Stream*> stream_to_flush; for (size_t idx = 0; idx < num_feeds; ++idx) { + Stream* copy_this_feed = nullptr; +#ifdef ORT_ENABLE_STREAM + if (device_stream_collection) { + if (copy_info[idx].unique_stream_index_consumes_it < 0) { + for (size_t i = 0; i < device_stream_collection->NumStreams(); i++) { + Stream* stream = device_stream_collection->GetStream(i); + if (stream && stream->GetDevice().Type() == copy_info[idx].target_device.Type()) { + copy_this_feed = stream; + stream_to_flush.insert(stream); + break; + } + } + } else { + copy_this_feed = device_stream_collection->GetStream(copy_info[idx].unique_stream_index_consumes_it); + } + } +#endif #if !defined(DISABLE_SPARSE_TENSORS) ORT_RETURN_IF_ERROR(BatchOrCopyMLValue(session_state, copy_info[idx], 
orig_feeds[idx], new_feeds[idx], - feed_streams[idx], + copy_this_feed, &batched_data_transfers, &batched_sparse_data_transfers)); #else ORT_RETURN_IF_ERROR(BatchOrCopyMLValue(session_state, copy_info[idx], orig_feeds[idx], new_feeds[idx], - feed_streams[idx], + copy_this_feed, &batched_data_transfers)); #endif } @@ -479,10 +507,7 @@ static common::Status CopyInputsAcrossDevices(const SessionState& session_state, // TODO: this sync is because the graph inputs can be consumed by multiple stream, // but we can only place the MemCpyAsync on one of the stream. Ideally we should make // other stream wait on the event of the memory copy stream, instead of host sync stream. - std::unordered_set<Stream*> visited; - for (auto* stream : feed_streams) { - if (stream && visited.insert(stream).second) stream->Flush(); - } + for (const auto& stream : stream_to_flush) stream->Flush(); return Status::OK(); } @@ -640,33 +665,12 @@ ExecuteGraphImpl(const SessionState& session_state, if (device_copy_checks.input_copy_needed == DeviceCopyCheck::Copy) { const auto& feed_copy_info = feeds_fetches_manager.GetFeedsDeviceCopyInfo(); - InlinedVector<Stream*> feed_streams; - feed_streams.reserve(feed_copy_info.size()); - // TODO: we can pre-calculate the stream index for graph inputs in execution plan + auto status = CopyInputsAcrossDevices(session_state, feeds, device_feeds, #ifdef ORT_ENABLE_STREAM - for (auto& copy_info : feed_copy_info) { - auto& device = copy_info.target_device; - bool found = false; - if (device_stream_collection) { - size_t num_streams = device_stream_collection->NumStreams(); - for (size_t i = 0; i < num_streams; i++) { - Stream* stream = device_stream_collection->GetStream(i); - if (stream && stream->GetDevice().Type() == device.Type()) { - feed_streams.push_back(stream); - found = true; - break; - } - } - } - if (!found) - feed_streams.push_back(nullptr); - } -#else - for (size_t i = 0; i < feed_copy_info.size(); ++i) { - feed_streams.push_back(nullptr); - } + device_stream_collection, #endif - ORT_RETURN_IF_ERROR(CopyInputsAcrossDevices(session_state, feeds, device_feeds, feed_copy_info, feed_streams)); + feed_copy_info); + ORT_RETURN_IF_ERROR(status); feeds_to_use = device_feeds; } @@ -819,27 +823,7 @@ common::Status ExecutePartialGraphImpl(const SessionState& session_state, FeedsF if (device_copy_checks.input_copy_needed == DeviceCopyCheck::Copy) { const auto& feed_copy_info = feeds_fetches_manager.GetFeedsDeviceCopyInfo(); - InlinedVector<Stream*> feed_streams; - feed_streams.reserve(feed_copy_info.size()); - // TODO: we can pre-calculate the stream index for graph inputs in execution plan - for (auto& copy_info : feed_copy_info) { - auto& device = copy_info.target_device; - bool found = false; - if (device_stream_collection) { - size_t num_streams = device_stream_collection->NumStreams(); - for (size_t i = 0; i < num_streams; i++) { - Stream* stream = device_stream_collection->GetStream(i); - if (stream && stream->GetDevice().Type() == device.Type()) { - feed_streams.push_back(stream); - found = true; - break; - } - } - } - if (!found) - feed_streams.push_back(nullptr); - } - ORT_RETURN_IF_ERROR(CopyInputsAcrossDevices(session_state, feeds, device_feeds, feed_copy_info, feed_streams)); + ORT_RETURN_IF_ERROR(CopyInputsAcrossDevices(session_state, feeds, device_feeds, device_stream_collection, feed_copy_info)); p_feeds = device_feeds; } @@ -1015,9 +999,19 @@ bool IsInputOnCpu(const Node& node, const KernelCreateInfo* p_kci, size_t index) } #ifdef ENABLE_ATEN + // For ATen node, we assume that all tensor inputs 
are on device, all non-tensor inputs are on CPU, + // except those specified in attribute cpu_input_args. if (node.GetExecutionProviderType() == kCudaExecutionProvider && node.OpType() == "ATen" && node.Domain() == kPytorchAtenDomain) { const auto& attrs = node.GetAttributes(); + if (auto entry = attrs.find("cpu_input_args"); entry != attrs.end()) { + const auto& attr = entry->second; + if (utils::HasInts(attr) && std::any_of(attr.ints().cbegin(), attr.ints().cend(), + [index](int64_t arg) { return static_cast<int64_t>(index) == arg; })) { + return true; + } + } + ORT_ENFORCE(utils::HasString(attrs.at("operator"))); std::string op_name = attrs.at("operator").s(); std::string overload_name = ""; @@ -1025,7 +1019,7 @@ bool IsInputOnCpu(const Node& node, const KernelCreateInfo* p_kci, size_t index) overload_name = attrs.at("overload_name").s(); } - return contrib::aten_ops::ATenOperatorExecutor::Instance().IsCpuArgument(op_name, overload_name, index, true); + return !contrib::aten_ops::ATenOperatorExecutor::Instance().IsTensorArgument(op_name, overload_name, index, true); } #else ORT_UNUSED_PARAMETER(node); @@ -1040,9 +1034,19 @@ bool IsOutputOnCpu(const Node& node, const KernelCreateInfo* p_kci, size_t index } #ifdef ENABLE_ATEN + // For ATen node, we assume that all tensor outputs are on device, all non-tensor outputs are on CPU, + // except those specified in attribute cpu_output_args. if (node.GetExecutionProviderType() == kCudaExecutionProvider && node.OpType() == "ATen" && node.Domain() == kPytorchAtenDomain) { const auto& attrs = node.GetAttributes(); + if (auto entry = attrs.find("cpu_output_args"); entry != attrs.end()) { + const auto& attr = entry->second; + if (utils::HasInts(attr) && std::any_of(attr.ints().cbegin(), attr.ints().cend(), + [index](int64_t arg) { return static_cast<int64_t>(index) == arg; })) { + return true; + } + } + ORT_ENFORCE(utils::HasString(attrs.at("operator"))); std::string op_name = attrs.at("operator").s(); std::string overload_name = ""; @@ -1050,7 +1054,7 @@ bool IsOutputOnCpu(const Node& node, const KernelCreateInfo* p_kci, size_t index overload_name = attrs.at("overload_name").s(); } - return contrib::aten_ops::ATenOperatorExecutor::Instance().IsCpuArgument(op_name, overload_name, index, false); + return !contrib::aten_ops::ATenOperatorExecutor::Instance().IsTensorArgument(op_name, overload_name, index, false); } #else ORT_UNUSED_PARAMETER(node); diff --git a/onnxruntime/core/graph/contrib_ops/bert_defs.cc b/onnxruntime/core/graph/contrib_ops/bert_defs.cc index ea67218b5c92..adfa1b61e192 100644 --- a/onnxruntime/core/graph/contrib_ops/bert_defs.cc +++ b/onnxruntime/core/graph/contrib_ops/bert_defs.cc @@ -14,11 +14,12 @@ using namespace ::ONNX_NAMESPACE; namespace ONNX_NAMESPACE { -void matmulShapeInference( +namespace defs::math::utils { +void MatMulShapeInference( ONNX_NAMESPACE::InferenceContext& ctx, int input1Idx, int input2Idx); - +} // namespace defs::math::utils } // namespace ONNX_NAMESPACE namespace onnxruntime { @@ -260,12 +261,22 @@ void GroupQueryAttentionTypeAndShapeInference(ONNX_NAMESPACE::InferenceContext& *output_shape.add_dim() = query_dims[2]; updateOutputShape(ctx, 0, output_shape); } else { - fail_shape_inference("Missing input 2 (value)"); + ONNX_NAMESPACE::TensorShapeProto output_shape; + int64_t num_heads = getAttribute(ctx, "num_heads", 0); + int64_t kv_num_heads = getAttribute(ctx, "kv_num_heads", 0); + int64_t hidden_size = query_dims[2].dim_value(); + int64_t head_size = hidden_size / (num_heads + 2 * kv_num_heads); + *output_shape.add_dim() 
= query_dims[0]; + *output_shape.add_dim() = query_dims[1]; + output_shape.add_dim()->set_dim_value(head_size * num_heads); + updateOutputShape(ctx, 0, output_shape); } } if (ctx.getNumOutputs() > 1) { // has present output if (hasInputShape(ctx, past_key_index)) { + // auto& query_shape = getInputShape(ctx, 0); + // auto& query_dims = query_shape.dim(); auto& past_shape = getInputShape(ctx, past_key_index); auto& past_dims = past_shape.dim(); if (past_dims.size() != 4) { @@ -273,8 +284,7 @@ void GroupQueryAttentionTypeAndShapeInference(ONNX_NAMESPACE::InferenceContext& } ONNX_NAMESPACE::propagateElemTypeFromInputToOutput(ctx, past_key_index, 1); ONNX_NAMESPACE::propagateElemTypeFromInputToOutput(ctx, static_cast<size_t>(past_key_index) + 1, 2); - ONNX_NAMESPACE::propagateShapeFromInputToOutput(ctx, past_key_index, 1); - ONNX_NAMESPACE::propagateShapeFromInputToOutput(ctx, static_cast<size_t>(past_key_index) + 1, 2); + // TODO(aciddelgado): propagate output shapes depending on whether kv-share buffer is on or not } } } @@ -333,6 +343,10 @@ ONNX_MS_OPERATOR_SET_SCHEMA( "Whether to use rotary position embedding. Default value is 0.", AttributeProto::INT, OPTIONAL_VALUE) + .Attr("rotary_embedding_dim", + "Dimension of rotary embedding. Limited to 32, 64 or 128. Default value is head_size", + AttributeProto::INT, + OPTIONAL_VALUE) .Attr("mask_filter_value", "The value to be filled in the attention mask. Default value is -10000.0f", AttributeProto::FLOAT, @@ -923,6 +937,10 @@ ONNX_MS_OPERATOR_SET_SCHEMA( "Custom scale will be used if specified. Default value is 1/sqrt(head_size)", AttributeProto::FLOAT, OPTIONAL_VALUE) + .Attr("unidirectional", + "Whether every token can only attend to previous tokens. Default value is 0.", + AttributeProto::INT, + static_cast<int64_t>(0)) .Input(0, "query", "Query with shape (batch_size, sequence_length, hidden_size), or packed QKV with shape (batch_size, kv_sequence_length, num_heads, 3, head_size)", "T") @@ -1007,18 +1025,29 @@ ONNX_MS_OPERATOR_SET_SCHEMA( "left_window_size for local attention (like Mistral). Default value is -1 meaning unused.", AttributeProto::INT, static_cast<int64_t>(-1)) + .Attr("do_rotary", + "Whether to use rotary position embedding. Default value is 0.", + AttributeProto::INT, + OPTIONAL_VALUE) + .Attr("rotary_interleaved", + "Rotate using interleaved pattern. Default value is 0 (False).", + AttributeProto::INT, + OPTIONAL_VALUE) .Input(0, "query", - "Query with shape (batch_size, sequence_length, hidden_size)", + "Query with shape (batch_size, sequence_length, hidden_size), or packed QKV with shape " + "(batch_size, sequence_length, d) where d is (num_heads * head_size + 2 * kv_num_heads * head_size).", "T") .Input(1, "key", "Key with shape (batch_size, kv_sequence_length, kv_hidden_size) ", - "T") + "T", + OpSchema::Optional) .Input(2, "value", "Value with shape (batch_size, kv_sequence_length, kv_hidden_size)", - "T") + "T", + OpSchema::Optional) .Input(3, "past_key", "past state key with support for format BNSH. 
When past_key uses same tensor as present_key" @@ -1039,6 +1068,16 @@ ONNX_MS_OPERATOR_SET_SCHEMA( "total_sequence_length", "Scalar tensor of total sequence length (past + new).", "M") + .Input(7, + "cos_cache", + "2D tensor with shape (max_sequence_length, head_size / 2).", + "T", + OpSchema::Optional) + .Input(8, + "sin_cache", + "2D tensor with shape (max_sequence_length, head_size / 2).", + "T", + OpSchema::Optional) .Output(0, "output", "3D output tensor with shape (batch_size, sequence_length, hidden_size)", "T") @@ -1055,7 +1094,7 @@ ONNX_MS_OPERATOR_SET_SCHEMA( "(k-v buffer), it is of length max_sequence_length... otherwise of length past_sequence_length +" "kv_sequence_length.", "T") - .TypeConstraint("T", {"tensor(float16)"}, "Constrain input and output to float tensors.") + .TypeConstraint("T", {"tensor(float16)", "tensor(bfloat16)"}, "Constrain input and output to float tensors.") .TypeConstraint("M", {"tensor(int32)"}, "Constrain mask to int tensor.") .TypeAndShapeInferenceFunction([](ONNX_NAMESPACE::InferenceContext& ctx) { GroupQueryAttentionTypeAndShapeInference(ctx, 3); @@ -1141,6 +1180,14 @@ ONNX_MS_OPERATOR_SET_SCHEMA( "Rotate using interleaved pattern. Default value is 0 (False).", AttributeProto::INT, OPTIONAL_VALUE) + .Attr("rotary_embedding_dim", + "Rotary embedding dimension. Default value is 0.", + AttributeProto::INT, + OPTIONAL_VALUE) + .Attr("num_heads", + "Number of attention heads. Default value is 0. Must be used with rotary_embedding_dim", + AttributeProto::INT, + OPTIONAL_VALUE) .Input(0, "input", "3D tensor with shape (batch_size, sequence_length, hidden_size) or 4D with shape (batch_size, num_heads, sequence_length, head_size)", "T") .Input(1, "position_ids", "1D tensor with shape (1) or 2D tensor with shape (batch_size, sequence_length)", "M") .Input(2, "cos_cache", - "2D tensor with shape (max_sequence_length, head_size / 2).", + "2D tensor with shape (max_sequence_length, head_size / 2) or (max_sequence_length, rotary_embedding_dim / 2)", "T") .Input(3, "sin_cache", - "2D tensor with shape (max_sequence_length, head_size / 2).", + "2D tensor with shape (max_sequence_length, head_size / 2) or (max_sequence_length, rotary_embedding_dim / 2)", "T") .Output(0, "output", "tensor with same shape as input.", "T") - .TypeConstraint("T", {"tensor(float)", "tensor(float16)"}, "Constrain input and output types to float tensors.") + .TypeConstraint("T", {"tensor(float)", "tensor(float16)", "tensor(bfloat16)"}, "Constrain input and output types to float tensors.") .TypeConstraint("M", {"tensor(int64)"}, "Constrain input and output types to integer tensors") .TypeAndShapeInferenceFunction([](ONNX_NAMESPACE::InferenceContext& ctx) { propagateElemTypeFromInputToOutput(ctx, 0, 0); propagateShapeFromInputToOutput(ctx, 0, 0); })); +constexpr const char* GemmaRotaryEmbedding_ver1_doc = R"DOC( +GemmaRotaryEmbedding implements the part of the rotary positional embeddings (RoPE) computation from modeling_gemma.py shown below. + +Here's the onnxscript that was tested: + +from onnxscript import FLOAT, FLOAT16, script +from onnxscript import opset18 as op + +@script() +def gemma_rotary_embedding(emb: FLOAT["bs", "seq_len", "dim"], q: FLOAT16["bs", "num_heads", "seq_len", "dim"], q_rot: FLOAT16["bs", "num_heads", "seq_len", "dim"], k: FLOAT16["bs", "num_heads", "seq_len", "dim"], k_rot: FLOAT16["bs", "num_heads", "seq_len", "dim"]): + sin_val = op.Sin(emb) + casted_sin = op.Cast(sin_val, to=10) # for fp16 mix-precision training. Other types are not supported. 
+ cos_val = op.Cos(emb) + casted_cos = op.Cast(cos_val, to=10) + unsqueezed_sin = op.Unsqueeze(casted_sin, [1]) + unsqueezed_cos = op.Unsqueeze(casted_cos, [1]) + q_embed = (q * casted_cos) + (q_rot * casted_sin) + k_embed = (k * casted_cos) + (k_rot * casted_sin) + return q_embed, k_embed + +onnx_model = gemma_rotary_embedding.to_model_proto() + + +)DOC"; +ONNX_MS_OPERATOR_SET_SCHEMA( + GemmaRotaryEmbedding, 1, + OpSchema() + .SetDoc(GemmaRotaryEmbedding_ver1_doc) + .Input(0, + "emb", + "embedding - 3D tensor with shape (batch_size, seq_len, dim)", + "U") + .Input(1, + "q", + "q state - 4D tensor with shape (batch_size, num_heads, seq_len, dim)", + "T") + .Input(2, + "q_rot", + "half rotated q state - 4D tensor with shape (batch_size, num_heads, seq_len, dim)", + "T") + .Input(3, + "k", + "k state - 4D tensor with shape (batch_size, num_heads, seq_len, dim)", + "T") + .Input(4, + "k_rot", + "half rotated k state - 4D tensor with shape (batch_size, num_heads, seq_len, dim)", + "T") + .Output(0, + "output1", + "4D tensor with shape (batch_size, num_heads, seq_len, dim)", + "T") + .Output(1, + "output2", + "4D tensor with shape (batch_size, num_heads, seq_len, dim)", + "T") + .TypeConstraint("T", {"tensor(float16)"}, "Constrain input and output types to float16 tensors.") + .TypeConstraint("U", {"tensor(float)"}, "Constrain input 0 type to float tensors") + .TypeAndShapeInferenceFunction([](ONNX_NAMESPACE::InferenceContext& ctx) { + propagateElemTypeFromInputToOutput(ctx, 1, 0); + propagateElemTypeFromInputToOutput(ctx, 1, 1); + propagateShapeFromInputToOutput(ctx, 1, 0); + propagateShapeFromInputToOutput(ctx, 1, 1); + })); + constexpr const char* EmbedLayerNormalization_ver1_doc = R"DOC( EmbedLayerNormalization is the fusion of embedding layer in BERT model, with optional mask processing. The embedding layer takes input_ids (word IDs) and segment_ids (sentence IDs) to look up word_embedding, position_embedding, @@ -1281,7 +1393,7 @@ ONNX_MS_OPERATOR_SET_SCHEMA( .Output(3, "input_skip_bias_sum", "Sum of the input and skip inputs (and bias if it exists) with shape (batch_size, sequence_length, hidden_size).", "T", OpSchema::Optional) .TypeConstraint("T", {"tensor(float)", "tensor(float16)"}, "Constrain input and output types to float or half tensors.") .TypeConstraint("U", {"tensor(float)"}, "Constrain mean and inv_std_var to float tensors.") - .TypeAndShapeInferenceFunction(ONNX_NAMESPACE::propagateShapeAndTypeFromFirstInput)); + .TypeAndShapeInferenceFunction(SkipLayerNormalizationShapeInference)); ONNX_MS_OPERATOR_SET_SCHEMA( SkipSimplifiedLayerNormalization, 1, @@ -1330,7 +1442,7 @@ ONNX_MS_OPERATOR_SET_SCHEMA( OpSchema::Optional) .TypeConstraint("T", {"tensor(float)", "tensor(float16)"}, "Constrain input and output types to float or half tensors.") .TypeConstraint("U", {"tensor(float)"}, "Constrain mean and inv_std_var to float tensors.") - .TypeAndShapeInferenceFunction(ONNX_NAMESPACE::propagateShapeAndTypeFromFirstInput)); + .TypeAndShapeInferenceFunction(SkipLayerNormalizationShapeInference)); constexpr const char* NGramRepeatBlock_ver1_doc = R"DOC( Enforce no repetition of n-grams. Scores are set to `-inf` for tokens that form a repeated n-gram if added to the back of the input_ids. 
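To make the rotary update that the GemmaRotaryEmbedding onnxscript above encodes concrete, here is a minimal scalar C++ sketch (illustrative only; the real op operates on full (batch_size, num_heads, seq_len, dim) tensors and performs the blend in float16 after casting the float cos/sin values):

#include <cmath>
#include <cstdio>

// Scalar model of: q_embed = q * cos(emb) + q_rot * sin(emb), per element.
// k_embed is computed the same way from k and k_rot.
int main() {
  const float emb = 0.5f;     // rotary angle for one (position, dim) slot
  const float q = 1.0f;       // query state element
  const float q_rot = -2.0f;  // matching element of the half-rotated query
  const float q_embed = q * std::cos(emb) + q_rot * std::sin(emb);
  std::printf("q_embed = %f\n", q_embed);  // 1*cos(0.5) + (-2)*sin(0.5)
  return 0;
}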
@@ -1398,7 +1510,7 @@ ONNX_MS_OPERATOR_SET_SCHEMA( "Constrain input and output types to float or half tensors.") .TypeAndShapeInferenceFunction([](ONNX_NAMESPACE::InferenceContext& ctx) { ONNX_NAMESPACE::propagateElemTypeFromInputToOutput(ctx, 0, 0); - ONNX_NAMESPACE::matmulShapeInference(ctx, 0, 1); + ONNX_NAMESPACE::defs::math::utils::MatMulShapeInference(ctx, 0, 1); })); constexpr const char* RemovePadding_ver1_doc = R"DOC( diff --git a/onnxruntime/core/graph/contrib_ops/collective_defs.cc b/onnxruntime/core/graph/contrib_ops/collective_defs.cc index 4aa43f5de1cd..a0ca2e45f153 100644 --- a/onnxruntime/core/graph/contrib_ops/collective_defs.cc +++ b/onnxruntime/core/graph/contrib_ops/collective_defs.cc @@ -91,10 +91,18 @@ void RegisterCollectiveOps() { "Number of top experts to select from expert pool", AttributeProto::INT, static_cast<int64_t>(1)) + .Attr("normalize_routing_weights", + "Whether to normalize routing weights", + AttributeProto::INT, + static_cast<int64_t>(0)) .Attr("local_experts_start_index", "The start index of local experts", AttributeProto::INT, - static_cast<int64_t>(-1)) + static_cast<int64_t>(0)) + .Attr("tensor_shards", + "Tensor parallelism config. The number of shards for each expert weight and bias", + AttributeProto::INT, + static_cast<int64_t>(1)) .Input(0, "input", "2D input tensor with shape (num_rows, hidden_size) or " "3D input tensor with shape (batch_size, sequence_length, hidden_size)", "T") .Input(2, "fc1_experts_weights", - "3D input tensor with shape (local_num_experts, hidden_size, inter_size)", + "3D input tensor with shape (local_num_experts, hidden_size, local_inter_size)", "T") .Input(3, - "fc2_experts_weights", - "3D input tensor with shape (local_num_experts, inter_size, hidden_size)", - "T") - .Input(4, "fc1_experts_bias", - "2D optional input tensor with shape (local_num_experts, inter_size)", + "2D optional input tensor with shape (local_num_experts, local_inter_size)", "T", OpSchema::Optional) + .Input(4, + "fc2_experts_weights", + "3D input tensor with shape (local_num_experts, local_inter_size, hidden_size)", + "T") .Input(5, "fc2_experts_bias", "2D optional input tensor with shape (num_experts, hidden_size)", "T", OpSchema::Optional) + .Input(6, + "fc3_experts_weights", + "3D optional input tensor with shape (local_num_experts, hidden_size, local_inter_size)", + "T", + OpSchema::Optional) + .Input(7, + "fc3_experts_bias", + "2D optional input tensor with shape (local_num_experts, local_inter_size)", + "T", + OpSchema::Optional) .Output(0, "output", "2D input tensor with shape (num_rows, hidden_size) or " diff --git a/onnxruntime/core/graph/contrib_ops/contrib_defs.cc b/onnxruntime/core/graph/contrib_ops/contrib_defs.cc index 54eb43753931..0f364b888006 100644 --- a/onnxruntime/core/graph/contrib_ops/contrib_defs.cc +++ b/onnxruntime/core/graph/contrib_ops/contrib_defs.cc @@ -39,10 +39,13 @@ void convPoolShapeInference( bool use_dilation, bool require_kernel_shape, int input1Idx, int input2Idx); -void matmulShapeInference( + +namespace defs::math::utils { +void MatMulShapeInference( ONNX_NAMESPACE::InferenceContext& ctx, int input1Idx, int input2Idx); +} void convTransposeWithDynamicPadsShapeInference(InferenceContext& ctx) { propagateElemTypeFromInputToOutput(ctx, 0, 0); @@ -1163,7 +1166,7 @@ ONNX_MS_OPERATOR_SET_SCHEMA(BeamSearch, 1, "Shape is (1,)", "T", OpSchema::Optional) .Input(6, "repetition_penalty", "The parameter for repetition penalty. Default value 1.0 means no penalty. Accepts value > 0.0. Shape is (1)", "T", OpSchema::Optional) - .Input(7, "vocab_mask", "Mask of vocabulary. 
Words that masked with 0 are not allowed to be generated, and 1 is allowed. Shape is (vacab_size)", "M", OpSchema::Optional) + .Input(7, "vocab_mask", "Mask of vocabulary. Words that masked with 0 are not allowed to be generated, and 1 is allowed. Shape is (vocab_size)", "M", OpSchema::Optional) .Input(8, "prefix_vocab_mask", "Mask of vocabulary for first step. Words that masked with 0 are not allowed to be generated, and 1 is allowed. Shape is (batch_size, vocab_size)", "M", OpSchema::Optional) .Input(9, "attention_mask", "Custom attention mask. Shape is (batch_size, sequence_length)", "I", OpSchema::Optional) .Input(10, "decoder_input_ids", "The forced input id sequence for the decoder subgraph. Shape is (batch_size, initial_sequence_length)", "I", OpSchema::Optional) @@ -1188,7 +1191,15 @@ ONNX_MS_OPERATOR_SET_SCHEMA(WhisperBeamSearch, 1, .SetDoc("Beam Search for whisper model, especially with cross_qk features etc.") .Attr("eos_token_id", "The id of the end-of-sequence token", AttributeProto::INT) .Attr("pad_token_id", "The id of the padding token", AttributeProto::INT) - .Attr("decoder_start_token_id", "The id of the token that indicates decoding starts.", AttributeProto::INT, static_cast<int64_t>(-1)) + .Attr("decoder_start_token_id", "The id of the token that indicates decoding starts (i.e. the start of transcription token id)", AttributeProto::INT, static_cast<int64_t>(-1)) + .Attr("translate_token_id", "The id of the translate task", AttributeProto::INT, OPTIONAL_VALUE) + .Attr("transcribe_token_id", "The id of the transcribe task", AttributeProto::INT, OPTIONAL_VALUE) + .Attr("start_of_lm_token_id", "The id of the token that indicates LM starts", AttributeProto::INT, OPTIONAL_VALUE) + .Attr("no_speech_token_id", + "The token in whisper model that marks all sequence empty. With this model, whisper could output no_speech_prob after. Default -1.", + AttributeProto::INT, OPTIONAL_VALUE) + .Attr("no_timestamps_token_id", "The id of the token that indicates no timestamps", AttributeProto::INT, OPTIONAL_VALUE) + .Attr("beginning_timestamp_token_id", "The id of the first timestamp", AttributeProto::INT, OPTIONAL_VALUE) .Attr("no_repeat_ngram_size", "no repeat ngrams size", AttributeProto::INT, static_cast<int64_t>(0)) .Attr("early_stopping", "early stop or not", AttributeProto::INT, static_cast<int64_t>(0)) .Attr("model_type", "Must be 2 for whisper", AttributeProto::INT, static_cast<int64_t>(2)) @@ -1203,27 +1214,24 @@ ONNX_MS_OPERATOR_SET_SCHEMA(WhisperBeamSearch, 1, "If not provided, it will be inferred from the decoder subgraph's output shape", AttributeProto::INT, static_cast<int64_t>(-1)) .Attr("decoder_output_cross_qk", "If nonzero, decoder subgraph contains output Q*K from cross attentions. Default 0.", AttributeProto::INT, OPTIONAL_VALUE) - .Attr("no_speech_token", - "The token in whisper model that marks all sequence empty. With this model, whisper could output no_speech_prob after. Default -1.", - AttributeProto::INT, OPTIONAL_VALUE) .Input(0, "input_ids", "The sequence used as a prompt for the generation in the encoder subgraph. Shape is (batch_size, sequence_length)", "F") .Input(1, "max_length", "The maximum length of the sequence to be generated. Shape is (1)", "I") .Input(2, "min_length", "The minimum length below which the score of eos_token_id is set to -Inf. Shape is (1)", "I", OpSchema::Optional) .Input(3, "num_beams", "Number of beams for beam search. 1 means no beam search. Shape is (1)", "I") .Input(4, "num_return_sequences", "The number of returned sequences in the batch. 
Shape is (1)", "I") .Input(5, "length_penalty", - "Exponential penalty to the length. Default value 1.0 means no penalty." - "Value > 1.0 encourages longer sequences, while values < 1.0 produces shorter sequences." + "Exponential penalty to the length. Default value 1.0 means no penalty. " + "Value > 1.0 encourages longer sequences, while values < 1.0 produces shorter sequences. " "Shape is (1,)", "T", OpSchema::Optional) .Input(6, "repetition_penalty", "The parameter for repetition penalty. Default value 1.0 means no penalty. Accepts value > 0.0. Shape is (1)", "T", OpSchema::Optional) - .Input(7, "vocab_mask", "Mask of vocabulary. Words that masked with 0 are not allowed to be generated, and 1 is allowed. Shape is (vacab_size)", "M", OpSchema::Optional) + .Input(7, "vocab_mask", "Mask of vocabulary. Words that masked with 0 are not allowed to be generated, and 1 is allowed. Shape is (vocab_size)", "M", OpSchema::Optional) .Input(8, "prefix_vocab_mask", "Mask of vocabulary for first step. Words that masked with 0 are not allowed to be generated, and 1 is allowed. Shape is (batch_size, vocab_size)", "M", OpSchema::Optional) .Input(9, "attention_mask", "Custom attention mask. Shape is (batch_size, sequence_length)", "I", OpSchema::Optional) .Input(10, "decoder_input_ids", "The forced input id sequence for the decoder subgraph. Shape is (batch_size, initial_sequence_length)", "I", OpSchema::Optional) .Input(11, "logits_processor", "Specific logits processor for different types of beamsearch models. Default value 0 means no specific logit processor. Accepts value >= 0. Shape is (1)", "I", OpSchema::Optional) .Input(12, "cross_qk_layer_head", - "Only keep this list of (layer, head) of QK in the final cross_qk output when use_cross_qk is set. Default collect all" + "Only keep this list of (layer, head) of QK in the final cross_qk output when use_cross_qk is set. Default collect all " "its shape is (number of (layer, head) to keep, 2), i.e., [[layer_id1, head_id1], [layer_id2, head_id2]......]", "I", OpSchema::Optional) .Input(13, "extra_decoding_ids", @@ -1231,23 +1239,23 @@ ONNX_MS_OPERATOR_SET_SCHEMA(WhisperBeamSearch, 1, "In such case, we should remove this from the tail of the decoder_input_ids, and put it here. ids < 0 in it (for multiple batch) " "are treated as stop of the extra_decoding_ids for corresponding batch.", "I", OpSchema::Optional) + .Input(14, "temperature", "Temperature value to apply to logits processing during this execution's decoding. Shape is (1)", "T", OpSchema::Optional) .Output(0, "sequences", "Word IDs of generated sequences. Shape is (batch_size, num_return_sequences, max_sequence_length)", "I") .Output(1, "sequences_scores", "Final beam score of the generated sequences. Shape is (batch_size, num_return_sequences)", "T", OpSchema::Optional) .Output(2, "scores", - "Processed beam scores for each vocabulary token at each generation step." - "Beam scores consisting of log softmax scores for each vocabulary token and sum of log softmax of previously generated tokens in this beam." + "Processed beam scores for each vocabulary token at each generation step. " + "Beam scores consisting of log softmax scores for each vocabulary token and sum of log softmax of previously generated tokens in this beam. " "Shape is (max_length - sequence_length, batch_size, num_beams, vocab_size)", "T", OpSchema::Optional) .Output(3, "cross_qk", "Output the accumulated stacked Q*K in cross attentions. 
Let H = number of Head of cross attention, " - "F = the frames or kv-seq-len of the cross attention input, T = real decoded token length, L = number of layers," - "B = batch size, R = num_return_sequences. It then should return tensor of shape [B, R, L*H, T, F]." + "F = the frames or kv-seq-len of the cross attention input, T = real decoded token length, L = number of layers, " + "B = batch size, R = num_return_sequences. It then should return tensor of shape [B, R, L*H, T, F]. " "If cross_qk_layer_head is given, shape is [B, R, cross_qk_layer_head.shape[0], T, F]", "V", OpSchema::Optional) .Output(4, "non_speech_probs", - "For whisper model, output the probabilities from logits after encoder and context decoding for the no_speech_token." - "Currently we treat the last token's logits is what we need, in future extra graph logic may be add to the encoder/context-decoder subgraph." - "The prob is save before logits may be updated by extra-decoding-ids. The shape of non_speech_probs is [B]", + "For whisper model, output the probabilities from logits after encoder and context decoding for the no_speech_token_id. " + "The shape of non_speech_probs is [B]", "T", OpSchema::Optional) .TypeConstraint("T", {"tensor(float)", "tensor(float16)"}, "Constrain to float tensors.") .TypeConstraint("F", {"tensor(float)", "tensor(int32)", "tensor(float16)"}, "Constrain input type to float or int tensors.") @@ -1321,7 +1329,7 @@ ONNX_MS_OPERATOR_SET_SCHEMA(GreedySearch, 1, .Input(1, "max_length", "The maximum length of the sequence to be generated. Shape is (1)", "I") .Input(2, "min_length", "The minimum length below which the score of eos_token_id is set to -Inf. Shape is (1)", "I", OpSchema::Optional) .Input(3, "repetition_penalty", "The parameter for repetition penalty. Default value 1.0 means no penalty. Accepts value > 0.0. Shape is (1)", "T", OpSchema::Optional) - .Input(4, "vocab_mask", "Mask of vocabulary. Words that masked with 0 are not allowed to be generated, and 1 is allowed. Shape is (vacab_size)", "I", OpSchema::Optional) + .Input(4, "vocab_mask", "Mask of vocabulary. Words that masked with 0 are not allowed to be generated, and 1 is allowed. Shape is (vocab_size)", "I", OpSchema::Optional) .Input(5, "prefix_vocab_mask", "Mask of vocabulary for first step. Words that masked with 0 are not allowed to be generated, and 1 is allowed. Shape is (batch_size, vocab_size)", "I", OpSchema::Optional) .Input(6, "attention_mask", "Custom attention mask. Shape is (batch_size, sequence_length)", "I", OpSchema::Optional) .Output(0, "sequences", "Word IDs of generated sequences. Shape is (batch_size, max_sequence_length)", "I") @@ -1362,7 +1370,7 @@ ONNX_MS_OPERATOR_SET_SCHEMA(Sampling, 1, .Input(1, "max_length", "The maximum length of the sequence to be generated. Shape is (1)", "I") .Input(2, "min_length", "The minimum length below which the score of eos_token_id is set to -Inf. Shape is (1)", "I", OpSchema::Optional) .Input(3, "repetition_penalty", "The parameter for repetition penalty. Default value 1.0 means no penalty. Accepts value > 0.0. Shape is (1)", "T", OpSchema::Optional) - .Input(4, "vocab_mask", "Mask of vocabulary. Words that masked with 0 are not allowed to be generated, and 1 is allowed. Shape is (vacab_size)", "I", OpSchema::Optional) + .Input(4, "vocab_mask", "Mask of vocabulary. Words that masked with 0 are not allowed to be generated, and 1 is allowed. Shape is (vocab_size)", "I", OpSchema::Optional) .Input(5, "prefix_vocab_mask", "Mask of vocabulary for first step. 
Words that masked with 0 are not allowed to be generated, and 1 is allowed. Shape is (batch_size, vocab_size)", "I", OpSchema::Optional) .Input(6, "attention_mask", "Custom attention mask. Shape is (batch_size, sequence_length)", "I", OpSchema::Optional) .Input(7, "presence_mask", "Presence penalty mask. Shape is (batch_size, vocab_size)", "I", OpSchema::Optional) @@ -1377,8 +1385,8 @@ ONNX_MS_OPERATOR_SET_SCHEMA(Sampling, 1, constexpr const char* MoE_ver1_doc = R"DOC( Mixture of experts. Examples: Switch transformer(https://arxiv.org/pdf/2101.03961.pdf) use top 1, - GLaM(https://arxiv.org/abs/2112.06905) activates top 2 FFN, and Vision MOE(https://arxiv.org/pdf/2106.05974.pdf) - usually uses top 32 experts. + GLaM(https://arxiv.org/abs/2112.06905) activates top 2 FFN, Vision MOE(https://arxiv.org/pdf/2106.05974.pdf) + usually uses top 32 experts, and Mixtral(https://huggingface.co/blog/mixtral) uses top 2 of 8 experts. )DOC"; ONNX_MS_OPERATOR_SET_SCHEMA(MoE, 1, @@ -1386,16 +1394,77 @@ ONNX_MS_OPERATOR_SET_SCHEMA(MoE, 1, .SetDoc(MoE_ver1_doc) .Attr("activation_type", "Activation function to use. Choose from relu, gelu, silu and identity. Default is relu", AttributeProto::STRING, std::string("relu")) .Attr("k", "Number of top experts to select from expert pool", AttributeProto::INT, static_cast<int64_t>(1)) + .Attr("normalize_routing_weights", "Whether to normalize routing weights", AttributeProto::INT, static_cast<int64_t>(0)) .Input(0, "input", "2D input tensor with shape (num_rows, hidden_size) or 3D input tensor with shape (batch_size, sequence_length, hidden_size)", "T") .Input(1, "router_probs", "2D input tensor with shape (num_rows, num_experts)", "T") .Input(2, "fc1_experts_weights", "3D input tensor with shape (num_experts, hidden_size, inter_size)", "T") - .Input(3, "fc2_experts_weights", "3D input tensor with shape (num_experts, inter_size, hidden_size)", "T") - .Input(4, "fc1_experts_bias", "2D optional input tensor with shape (num_experts, inter_size)", "T", OpSchema::Optional) + .Input(3, "fc1_experts_bias", "2D optional input tensor with shape (num_experts, inter_size)", "T", OpSchema::Optional) + .Input(4, "fc2_experts_weights", "3D input tensor with shape (num_experts, inter_size, hidden_size)", "T") .Input(5, "fc2_experts_bias", "2D optional input tensor with shape (num_experts, hidden_size)", "T", OpSchema::Optional) + .Input(6, "fc3_experts_weights", "3D optional input tensor with shape (num_experts, hidden_size, inter_size)", "T", OpSchema::Optional) + .Input(7, "fc3_experts_bias", "2D optional input tensor with shape (num_experts, inter_size)", "T", OpSchema::Optional) .Output(0, "output", "2D input tensor with shape (num_rows, hidden_size) or 3D input tensor with shape (batch_size, sequence_length, hidden_size)", "T") .TypeConstraint("T", {"tensor(float)", "tensor(float16)"}, "Constrain input and output types to float or float16 tensors.") .TypeAndShapeInferenceFunction(ONNX_NAMESPACE::propagateShapeAndTypeFromFirstInput)); +ONNX_MS_OPERATOR_SET_SCHEMA( + QMoE, 1, + OpSchema() + .SetDoc("Int4 MoE") + .Attr("activation_type", + "Activation function to use. Choose from relu, gelu, silu and identity.
Default is relu", + AttributeProto::STRING, + std::string("relu")) + .Attr("k", + "Number of top experts to select from expert pool", + AttributeProto::INT, + static_cast(1)) + .Attr("normalize_routing_weights", + "Whether to normalize routing weights", + AttributeProto::INT, + static_cast(0)) + .Input(0, + "input", + "2D input tensor with shape (num_rows, hidden_size) or 3D input tensor with shape " + "(batch_size, sequence_length, hidden_size)", + "T") + .Input(1, "router_probs", "2D input tensor with shape (num_rows, num_experts)", "T") + .Input(2, "fc1_experts_weights", "3D input tensor with shape (num_experts, hidden_size, inter_size / 2)", "T1") + .Input(3, "fc1_scales", "2D input tensor with shape (num_experts, inter_size)", "T") + .Input(4, + "fc1_experts_bias", + "2D optional input tensor with shape (num_experts, inter_size)", "T", OpSchema::Optional) + .Input(5, "fc2_experts_weights", "3D input tensor with shape (num_experts, inter_size, hidden_size / 2)", "T1") + .Input(6, "fc2_scales", "2D input tensor with shape (num_experts, hidden_size)", "T") + .Input(7, + "fc2_experts_bias", + "2D optional input tensor with shape (num_experts, hidden_size)", + "T", + OpSchema::Optional) + .Input(8, + "fc3_experts_weights", + "3D optional input tensor with shape (num_experts, hidden_size, inter_size / 2)", + "T1", + OpSchema::Optional) + .Input(9, + "fc3_scales", + "2D optional input tensor with shape (num_experts, inter_size)", + "T", + OpSchema::Optional) + .Input(10, + "fc3_experts_bias", + "2D optional input tensor with shape (num_experts, inter_size)", + "T", + OpSchema::Optional) + .Output(0, + "output", + "2D input tensor with shape (num_rows, hidden_size) or 3D input tensor with shape " + "(batch_size, sequence_length, hidden_size)", + "T") + .TypeConstraint("T", {"tensor(float16)"}, "Constrain input and output types to float or float16 tensors.") + .TypeConstraint("T1", {"tensor(uint8)"}, "Constrain weights type to uint8 tensors.") + .TypeAndShapeInferenceFunction(ONNX_NAMESPACE::propagateShapeAndTypeFromFirstInput)); + ONNX_MS_OPERATOR_SET_SCHEMA(SampleOp, 1, OpSchema() .Input(0, "X", "input", "T") @@ -1893,7 +1962,7 @@ Matrix product that behaves like numpy.matmul: https://docs.scipy.org/doc/numpy- // Right now we only support int32 y_type->mutable_tensor_type()->set_elem_type(ONNX_NAMESPACE::TensorProto::INT32); - ONNX_NAMESPACE::matmulShapeInference(ctx, 0, 1); + ONNX_NAMESPACE::defs::math::utils::MatMulShapeInference(ctx, 0, 1); })); /** @@ -3230,6 +3299,11 @@ void RegisterContribSchemas() { "(Optional) SDK version used to convert the model.", AttributeProto::STRING, OPTIONAL_VALUE) + .Attr( + "hardware_architecture", + "(Optional) Hardware architecture.", + AttributeProto::STRING, + OPTIONAL_VALUE) .Attr( "partition_name", "(Optional) partitioned graph name.", @@ -3333,22 +3407,23 @@ MatMulNBits is a MatMul with weight quantized with N bits(e.g., 2, 3, 4, 5, 6, 7 And block_size is not an arbitrary number and must be a power of 2 and not smaller than 16, like 16, 32, 64, 128,.. 3. Input B's scale and zero point are specified by input scales and zero_points. 
-Input B is stored as uint8_t with shape: [N][n_blocks_per_col][blob_size] in which: -- n_blocks_per_col = (K + block_size - 1) / block_size -- blob_size = block_size / 8 * bits + Input B is stored as uint8_t with shape: [N][n_blocks_per_col][blob_size] in which: + - n_blocks_per_col = (K + block_size - 1) / block_size + - blob_size = CeilDiv(block_size * bits, 8), where 8 is the number of bits in one uint8_t + For all bits from 2-8, a row of data is packed tightly and represented by uint8_t. + - for 2, 4, 8 bits: 4x2bit, 2x4bit, or 1x8bit values are stored in one uint8_t. + 4bit example: + |.|.|.|.| |.|.|.|.| = one uint8_t (2x4bit) + - for 3, 5, 6, 7 bits: 32x3bit, 32x5bit, 16x6bit, or 32x7bit values are stored in 12, 20, 12, or 28 uint8_t respectively; no bits are wasted. + 3bit example: + |.|.|.| |.|.|.| |.|.|.| = 9 bits, which span 2 uint8_t; the highest bit of the second uint8_t is used. + The last uint8_t may have some bits unused. - For a block blob. It is stored in format: - struct Blob { - uint8 one_bits[(bits & 0x1) * 1 * block_size / 8]; // highest 1 bit for 3, 5, 7 bits quantization - uint8 two_bits[(bits & 0x2) * 2 * block_size / 8]; // high 2 bits for 2, 6, 7 bits quantization - uint8 four_bits[(bits & 0x4) * 4 * block_size / 8]; // low 4 bits for 4, 5, 6 bits quantization - } Input scales is stored in same type as original type of B(float32, float16) with shape like: [N * n_blocks_per_col] -Input zero_points is stored as uint8_t. If bits <= 4, two zero points are stored as one unit8_t. If bits > 4, one zero point is stored with one unit8_t. Thus, its shape is: - - [(N * n_blocks_per_col + 1) / 2] if bits <=4 - - [N * n_blocks_per_col] if bits > 4 - +Input zero_points is stored as uint8_t or in the same type as type(A). When stored as uint8_t, it uses the same packing method as input B, so its shape is: + - [CeilDiv((N * n_blocks_per_col + 1) * bits, 8)] + If zero_points has the same type as A, it is not packed and has the same shape as scales. )DOC"; ONNX_CONTRIB_OPERATOR_SCHEMA(MatMulNBits) @@ -3367,12 +3442,15 @@ Input zero_points is stored as uint8_t. If bits <= 4, two zero points are stored "type T1.", AttributeProto::INT, static_cast<int64_t>(0)) .Input(0, "A", "The input tensor, not quantized", "T1") - .Input(1, "B", "1-dimensional data blob", "T2") + .Input(1, "B", "1- or 2-dimensional data blob", "T2") .Input(2, "scales", "quantization scale", "T1") - .Input(3, "zero_points", "quantization zero points", "T2", OpSchema::Optional) + .Input(3, "zero_points", "quantization zero points", "T3", OpSchema::Optional) + .Input(4, "g_idx", "group_idx", "T4", OpSchema::Optional) .Output(0, "Y", "tensor. The output tensor has the same rank as the input.
", "T1") .TypeConstraint("T1", {"tensor(float)", "tensor(float16)"}, "Constrain input and output types to float/half_float tensors.") - .TypeConstraint("T2", {"tensor(uint8)"}, "Constrain quantized weight types to uint8.") + .TypeConstraint("T2", {"tensor(uint8)", "tensor(int32)"}, "Constrain quantized weight types to uint8/int32.") + .TypeConstraint("T3", {"tensor(uint8)", "tensor(int32)", "tensor(float16)", "tensor(float)"}, "Constrain quantized zero point types to uint8/int32/float16/float.") + .TypeConstraint("T4", {"tensor(int32)"}, "the index tensor.") .TypeAndShapeInferenceFunction([](ONNX_NAMESPACE::InferenceContext& ctx) { // Type inference propagateElemTypeFromInputToOutput(ctx, 0, 0); @@ -3460,6 +3538,8 @@ MatMulBnb4 is a MatMul with weight quantized with 4 bits using either FP4 or NF4 /*min_arity*/ 1) .Attr("operator", "Name of ATen operator.", AttributeProto::STRING) .Attr("overload_name", "Overload name of ATen operator.", AttributeProto::STRING, false) + .Attr("cpu_input_args", "CPU input argument indices.", AttributeProto::INTS, false) + .Attr("cpu_output_args", "CPU output argument indices.", AttributeProto::INTS, false) .TypeConstraint("T", OpSchema::all_tensor_types_ir4(), "Allow inputs and outputs to be any kind of tensor."); #endif diff --git a/onnxruntime/core/graph/contrib_ops/internal_nhwc_onnx_schemas.cc b/onnxruntime/core/graph/contrib_ops/internal_nhwc_onnx_schemas.cc index c8960578f9e3..6bf19654a3ce 100644 --- a/onnxruntime/core/graph/contrib_ops/internal_nhwc_onnx_schemas.cc +++ b/onnxruntime/core/graph/contrib_ops/internal_nhwc_onnx_schemas.cc @@ -106,6 +106,7 @@ void OpSet_Internal_NHWC_ONNX::ForEachSchema(const std::function()); fn(GetOpSchema()); + fn(GetOpSchema()); fn(GetOpSchema()); fn(GetOpSchema()); fn(GetOpSchema()); @@ -206,6 +209,7 @@ class OpSet_Microsoft_ver1 { fn(GetOpSchema()); fn(GetOpSchema()); fn(GetOpSchema()); + fn(GetOpSchema()); fn(GetOpSchema()); fn(GetOpSchema()); fn(GetOpSchema()); diff --git a/onnxruntime/core/graph/contrib_ops/quantization_defs.cc b/onnxruntime/core/graph/contrib_ops/quantization_defs.cc index 4313fae767fe..47f61a43458e 100644 --- a/onnxruntime/core/graph/contrib_ops/quantization_defs.cc +++ b/onnxruntime/core/graph/contrib_ops/quantization_defs.cc @@ -22,7 +22,9 @@ void RNNShapeInference(InferenceContext& ctx); void convTransposeShapeInference(InferenceContext& ctx); void convPoolShapeInference(ONNX_NAMESPACE::InferenceContext& ctx, bool use_dilation, bool require_kernel_shape, int input1Idx, int input2Idx); -void matmulShapeInference(ONNX_NAMESPACE::InferenceContext& ctx, int input1Idx, int input2Idx); +namespace defs::math::utils { + void MatMulShapeInference(ONNX_NAMESPACE::InferenceContext& ctx, int input1Idx, int input2Idx); +} } // namespace ONNX_NAMESPACE @@ -400,7 +402,7 @@ ONNX_MS_OPERATOR_SET_SCHEMA( .TypeConstraint("T2", {"tensor(int8)", "tensor(uint8)"}, "Constrain input B data type to 8-bit integer tensor.") .TypeAndShapeInferenceFunction([](ONNX_NAMESPACE::InferenceContext& ctx) { propagateElemTypeFromInputToOutput(ctx, 0, 0); - ONNX_NAMESPACE::matmulShapeInference(ctx, 0, 1); + ONNX_NAMESPACE::defs::math::utils::MatMulShapeInference(ctx, 0, 1); })); ONNX_MS_OPERATOR_SET_SCHEMA( @@ -434,11 +436,11 @@ ONNX_MS_OPERATOR_SET_SCHEMA( .Output(0, "Y", "Matrix multiply results from A * B", "T3") .TypeConstraint("T1", {"tensor(int8)", "tensor(uint8)"}, "Constrain input A data type to 8-bit integer tensor.") .TypeConstraint("T2", {"tensor(int8)", "tensor(uint8)"}, "Constrain input B data type to 8-bit integer 
tensor.") - .TypeConstraint("T3", {"tensor(float)"}, + .TypeConstraint("T3", {"tensor(float)", "tensor(float16)"}, "Constrain input a_scale, b_scale and output Y data type as float tensor.") .TypeAndShapeInferenceFunction([](ONNX_NAMESPACE::InferenceContext& ctx) { propagateElemTypeFromInputToOutput(ctx, 2, 0); - ONNX_NAMESPACE::matmulShapeInference(ctx, 0, 1); + ONNX_NAMESPACE::defs::math::utils::MatMulShapeInference(ctx, 0, 1); })); ONNX_MS_OPERATOR_SET_SCHEMA( @@ -1129,7 +1131,7 @@ ONNX_MS_OPERATOR_SET_SCHEMA( .TypeConstraint("S", {"tensor(float)"}, "Constrain bias and scales to float32") .TypeAndShapeInferenceFunction([](ONNX_NAMESPACE::InferenceContext& ctx) { propagateElemTypeFromInputToOutput(ctx, 0, 0); - ONNX_NAMESPACE::matmulShapeInference(ctx, 0, 2); + ONNX_NAMESPACE::defs::math::utils::MatMulShapeInference(ctx, 0, 2); })); static const char* Attention_QOrdered_doc = R"DOC( diff --git a/onnxruntime/core/graph/contrib_ops/shape_inference_functions.cc b/onnxruntime/core/graph/contrib_ops/shape_inference_functions.cc index eeef20e9dff5..8b1812f62be2 100644 --- a/onnxruntime/core/graph/contrib_ops/shape_inference_functions.cc +++ b/onnxruntime/core/graph/contrib_ops/shape_inference_functions.cc @@ -114,6 +114,45 @@ void EmbedLayerNormalizationShapeInference(::ONNX_NAMESPACE::InferenceContext& c } } +void SkipLayerNormalizationShapeInference(::ONNX_NAMESPACE::InferenceContext& ctx) { + propagateShapeAndTypeFromFirstInput(ctx); + + auto stash_type = ONNX_NAMESPACE::TensorProto_DataType_FLOAT; + if (ctx.getNumOutputs() > 1) { + auto output_type = ctx.getOutputType(1); + output_type->mutable_tensor_type()->set_elem_type(static_cast(stash_type)); + } + if (ctx.getNumOutputs() > 2) { + auto output_type = ctx.getOutputType(2); + output_type->mutable_tensor_type()->set_elem_type(static_cast(stash_type)); + } + if (ctx.getNumOutputs() > 3) { + propagateElemTypeFromInputToOutput(ctx, 0, 3); + } + if (!hasNInputShapes(ctx, 1)) { + return; + } + auto& input_shape = ctx.getInputType(0)->tensor_type().shape(); + int64_t input_ndim = input_shape.dim_size(); + int axis = static_cast(input_ndim - 1); + + if (ctx.getNumOutputs() > 1) { + auto mean_shape = ctx.getOutputType(1)->mutable_tensor_type()->mutable_shape(); + mean_shape->CopyFrom(input_shape); + mean_shape->mutable_dim(axis)->set_dim_value(1); + } + + if (ctx.getNumOutputs() > 2) { + auto inv_std_dev_shape = ctx.getOutputType(2)->mutable_tensor_type()->mutable_shape(); + inv_std_dev_shape->CopyFrom(input_shape); + inv_std_dev_shape->mutable_dim(axis)->set_dim_value(1); + } + + if (ctx.getNumOutputs() > 3) { + propagateShapeFromInputToOutput(ctx, 0, 3); + } +} + // Shape inference for Attention and QAttention void AttentionTypeAndShapeInference(ONNX_NAMESPACE::InferenceContext& ctx, int past_input_index) { // Input 0, 1, 2 are input, weights and bias. 
diff --git a/onnxruntime/core/graph/contrib_ops/shape_inference_functions.h b/onnxruntime/core/graph/contrib_ops/shape_inference_functions.h index 93cf5b304f65..6eb06af15309 100644 --- a/onnxruntime/core/graph/contrib_ops/shape_inference_functions.h +++ b/onnxruntime/core/graph/contrib_ops/shape_inference_functions.h @@ -13,5 +13,6 @@ namespace onnxruntime { namespace contrib { void AttentionTypeAndShapeInference(ONNX_NAMESPACE::InferenceContext& ctx, int past_input_index); void EmbedLayerNormalizationShapeInference(::ONNX_NAMESPACE::InferenceContext& ctx); +void SkipLayerNormalizationShapeInference(::ONNX_NAMESPACE::InferenceContext& ctx); } // namespace contrib -} // namespace onnxruntime \ No newline at end of file +} // namespace onnxruntime diff --git a/onnxruntime/core/graph/graph.cc b/onnxruntime/core/graph/graph.cc index baebe2420073..2220b9cd1db7 100644 --- a/onnxruntime/core/graph/graph.cc +++ b/onnxruntime/core/graph/graph.cc @@ -1818,16 +1818,36 @@ void Graph::ReverseDFSFrom(gsl::span from, } } +template <typename T> +struct VisitorPriorityQueue { + using ComparatorType = std::function<bool(T, T)>; + std::list<T> list_; + const ComparatorType comparator_ = nullptr; + VisitorPriorityQueue(const ComparatorType& comp) : comparator_(comp) {} + + void push(T node) { + list_.insert( + std::upper_bound(list_.begin(), list_.end(), node, comparator_), + node); + } + bool empty() { return list_.empty(); } + T top() { return list_.back(); } + void pop() { list_.pop_back(); } +}; + #if !defined(ORT_MINIMAL_BUILD) void Graph::KahnsTopologicalSort(const std::function<void(const Node*)>& enter, const std::function<bool(const Node*, const Node*)>& comp) const { - std::unordered_map<NodeIndex, size_t> in_degree; - std::priority_queue<const Node*, std::vector<const Node*>, decltype(comp)> to_visit(comp); - std::vector<NodeIndex> topo_order; + InlinedVector<size_t> in_degree(MaxNodeIndex(), 0); + InlinedVector<NodeIndex> topo_order; + VisitorPriorityQueue<const Node*> to_visit(comp); + + auto number_of_nodes = NumberOfNodes(); + topo_order.reserve(number_of_nodes); for (auto& node : Nodes()) { size_t input_edge_count = node.GetInputEdgesCount(); - in_degree.insert({node.Index(), input_edge_count}); + in_degree[node.Index()] = input_edge_count; if (input_edge_count == 0) { to_visit.push(&node); } @@ -1844,16 +1864,17 @@ void Graph::KahnsTopologicalSort(const std::function<void(const Node*)>& enter, } for (auto node_it = current->OutputNodesBegin(); node_it != current->OutputNodesEnd(); ++node_it) { - in_degree[node_it->Index()]--; + auto& node_in_degree = in_degree[node_it->Index()]; + node_in_degree--; - if (in_degree[node_it->Index()] == 0) { + if (node_in_degree == 0) { to_visit.push(&*node_it); } } topo_order.push_back(current->Index()); } - if (NumberOfNodes() != static_cast<int>(topo_order.size())) { + if (number_of_nodes != static_cast<int>(topo_order.size())) { ORT_THROW("Some nodes are not included in the topological sort, graph has a cycle."); } } @@ -2367,8 +2388,14 @@ Status Graph::InferAndVerifyTypeMatch(Node& node, const OpSchema& op, const Reso inferred_type = existing_type; } else { // This should not happen: indicates incompleteness in ONNX inference.
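// [Editor's aside, not part of the patch] The VisitorPriorityQueue introduced
// in graph.cc above keeps its backing list sorted on every push, so taking
// top()/pop() from the back yields the same element std::priority_queue would,
// while staying cheap for the small visit frontiers Kahn's sort produces.
// A self-contained sketch with hypothetical names:
#include <algorithm>
#include <cassert>
#include <functional>
#include <list>

template <typename T>
struct MiniVisitorQueue {
  using Comparator = std::function<bool(T, T)>;
  std::list<T> list_;
  Comparator comparator_;
  explicit MiniVisitorQueue(Comparator comp) : comparator_(std::move(comp)) {}
  // Insert before the upper bound so the list stays sorted by the comparator.
  void push(T v) { list_.insert(std::upper_bound(list_.begin(), list_.end(), v, comparator_), v); }
  bool empty() const { return list_.empty(); }
  T top() const { return list_.back(); }
  void pop() { list_.pop_back(); }
};

int main() {
  MiniVisitorQueue<int> q(std::less<int>{});  // ascending list, so back() is the max
  q.push(2); q.push(5); q.push(1);
  assert(q.top() == 5);  // matches std::priority_queue<int>::top()
  q.pop();
  assert(q.top() == 2);
  return 0;
}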
+ std::stringstream ss; + ss << "index=" << operand_index; + for (auto it = op_formal_parameter.GetTypes().begin(); it != op_formal_parameter.GetTypes().end(); ++it) { + ss << "," << *(*it); + } Status status(ONNXRUNTIME, onnxruntime::common::StatusCode::FAIL, - "Node (" + node_name + ") output arg (" + output_def->Name() + ") type inference failed"); + "Node (" + node_name + ") Op (" + node.OpType() + ") output arg (" + + output_def->Name() + ") type inference failed, inferred types: " + ss.str()); return status; } @@ -2550,15 +2577,23 @@ Status Graph::VerifyNodeAndOpMatch(const ResolveOptions& options) { // Node verification. auto& node = *GetNode(node_index); - NodeProto node_proto; - node.ToProto(node_proto); const auto& node_name = node.Name(); if (!node.Op()) { { auto status = Status::OK(); ORT_TRY { - checker::check_node(node_proto, ctx, lsc); + // if this is the first Graph::Resolve call, we may have a NodeProto that was set on the Node so we can skip + // the ToProto call. + if (const NodeProto* orig_node_proto = node.GetOriginalNodeProto(); orig_node_proto) { + checker::check_node(*orig_node_proto, ctx, lsc); + // clear original as we don't know if the node will be modified once the Graph::Resolve completes. + node.SetOriginalNodeProto(nullptr); + } else { + NodeProto node_proto; + node.ToProto(node_proto); + checker::check_node(node_proto, ctx, lsc); + } } ORT_CATCH(const std::exception& ex) { ORT_HANDLE_EXCEPTION([&]() { @@ -2630,8 +2665,8 @@ Status Graph::VerifyNodeAndOpMatch(const ResolveOptions& options) { NO_CHANGE_ON_SYNC_FLAG(ORT_RETURN_IF_ERROR(InferAndVerifyTypeMatch(node, *p_op, options))); // Accumulate output names of the iterated Node - for (auto& output_name : node_proto.output()) { - lsc.output_names.insert(output_name); + for (const auto& output : node.OutputDefs()) { + lsc.output_names.insert(output->Name()); } } @@ -2792,12 +2827,13 @@ Status Graph::Resolve(const ResolveOptions& options) { graph.GraphProtoSyncNeeded(false); } + // set num_resolves_ here so the graph and any subgraphs all have the same value + ++graph.num_resolves_; + return Status::OK(); }; ORT_RETURN_IF_ERROR(ForThisAndAllSubgraphs(all_subgraphs, finalize_func)); - ++num_resolves_; - return Status::OK(); } @@ -2836,7 +2872,7 @@ void Graph::AddInitializedTensor(const TensorProto& tensor) { const gsl::not_null<TensorProto*> tensor_added{graph_proto_->add_initializer()}; *(tensor_added) = tensor; - name_to_initial_tensor_[tensor.name()] = tensor_added; + name_to_initial_tensor_.emplace(tensor.name(), tensor_added); SetGraphResolveNeeded(); if (!is_loaded_from_model_file_ && GetNodeArg(tensor.name()) == nullptr) { // make sure there is a NodeArg for the initializer as SetGraphInputsOutputs may add it to the graph inputs. @@ -3095,13 +3131,25 @@ Node& Graph::AddNode(const NodeProto& node_proto, attributes[attr.name()] = attr; } - return AddNode(node_proto.name(), - node_proto.op_type(), - node_proto.doc_string(), - input_defs, - output_defs, - &attributes, - node_proto.domain()); + Node& new_node = AddNode(node_proto.name(), + node_proto.op_type(), + node_proto.doc_string(), + input_defs, + output_defs, + &attributes, + node_proto.domain()); + + // Perf optimization: temporarily set NodeProto in Node so we don't need to call Node::ToProto prior to + // calling onnx::check_node + // NOTE: We don't handle a node with kOnnxDomainAlias. The entry in schema_registry_ uses kOnnxDomain, + // and that's what onnx::check_node uses during validation.
+ // The Node ctor automatically converts kOnnxDomainAlias to kOnnxDomain to handle this. + // node_proto is const so we can't do the same here. + if (node_proto.domain() != kOnnxDomainAlias) { + new_node.SetOriginalNodeProto(&node_proto); + } + + return new_node; } static flatbuffers::Offset>> diff --git a/onnxruntime/core/graph/graph_flatbuffers_utils.cc b/onnxruntime/core/graph/graph_flatbuffers_utils.cc index 8e962403556d..2314a5228f83 100644 --- a/onnxruntime/core/graph/graph_flatbuffers_utils.cc +++ b/onnxruntime/core/graph/graph_flatbuffers_utils.cc @@ -3,7 +3,7 @@ #include "graph_flatbuffers_utils.h" -#include "flatbuffers/flatbuffers.h" +#include "core/common/flatbuffers.h" #include "core/common/narrow.h" #include "core/flatbuffers/flatbuffers_utils.h" @@ -392,6 +392,14 @@ Status LoadOrtTensorOrtFormat(const fbs::Tensor& fbs_tensor, const AllocatorPtr ort_tensor = onnxruntime::Tensor( tensor_dtype, TensorShape(tensor_dims->data(), tensor_dims->size()), allocator); + if (fbs_tensor.raw_data()->size() == 0U) { + // Empty tensor. Nothing to unpack. + // This check is necessary because an empty ort tensor will return a size of 1. + // As a result, the following call to UnpackTensor will fail since the src and + // dst sizes do not match (0 and 1 elements). + return Status::OK(); + } + // The tensor proto is used as a dummy here. The actual data is stored in the raw_data field of the flatbuffer. // The data is copied from the raw_data field to the ort_tensor. ONNX_NAMESPACE::TensorProto unused_tensor_proto; diff --git a/onnxruntime/core/graph/graph_flatbuffers_utils.h b/onnxruntime/core/graph/graph_flatbuffers_utils.h index b625cbf3ca49..9c55dad3c41e 100644 --- a/onnxruntime/core/graph/graph_flatbuffers_utils.h +++ b/onnxruntime/core/graph/graph_flatbuffers_utils.h @@ -5,7 +5,7 @@ #include -#include "flatbuffers/flatbuffers.h" +#include "core/common/flatbuffers.h" #include "core/common/status.h" #include "core/graph/ort_format_load_options.h" diff --git a/onnxruntime/core/graph/graph_viewer.cc b/onnxruntime/core/graph/graph_viewer.cc index cf78040ea5ac..119d420066a8 100644 --- a/onnxruntime/core/graph/graph_viewer.cc +++ b/onnxruntime/core/graph/graph_viewer.cc @@ -14,8 +14,8 @@ bool NodeCompare::operator()(const Node* n1, const Node* n2) const { struct PriorityNodeCompare { inline bool IsHighPri(const Node* n) const { // local statics so we can compare std::strings in the checks - static const std::string shape_op("Shape"); - static const std::string size_op("Size"); + static constexpr std::string_view shape_op("Shape"); + static constexpr std::string_view size_op("Size"); const auto& op_type = n->OpType(); return op_type == shape_op || op_type == size_op; @@ -26,15 +26,20 @@ struct PriorityNodeCompare { // If return true, n2 will be output first bool operator()(const Node* n1, const Node* n2) const { // nodes in global high priority list will be output first - if (IsHighPri(n1) != IsHighPri(n2)) { - return IsHighPri(n2); + const bool isN1HighPri = IsHighPri(n1); + const bool isN2HighPri = IsHighPri(n2); + if (isN1HighPri != isN2HighPri) { + return isN2HighPri; } // nodes with lower priority value will be output first - if (n1->Priority() != n2->Priority()) { - return n1->Priority() > n2->Priority(); + const auto n1_priority = n1->Priority(); + const auto n2_priority = n2->Priority(); + if (n1_priority != n2_priority) { + return n1_priority > n2_priority; } +#ifdef ENABLE_TRAINING // nodes of forward pass will be output first auto n1_attrs = n1->GetAttributes(); auto n2_attrs = 
n2->GetAttributes(); @@ -45,6 +50,7 @@ struct PriorityNodeCompare { if (n1_is_forward != n2_is_forward) { return n2_is_forward > n1_is_forward; } +#endif // otherwise, nodes with lower index will be output first return n1->Index() > n2->Index(); @@ -212,6 +218,8 @@ const std::string& GraphViewer::Description() const noexcept { bool GraphViewer::GetInitializedTensor(const std::string& tensor_name, const ONNX_NAMESPACE::TensorProto*& value) const { + value = nullptr; + // if we are using filtered subgraph, the initializer has to be part of the subgraph if (filter_info_ != nullptr && filtered_initializers_.find(tensor_name) == filtered_initializers_.cend()) return false; diff --git a/onnxruntime/core/graph/model.h b/onnxruntime/core/graph/model.h index 4ce6660b794b..a774d5fe3446 100644 --- a/onnxruntime/core/graph/model.h +++ b/onnxruntime/core/graph/model.h @@ -8,7 +8,7 @@ #include #include -#include "flatbuffers/flatbuffers.h" +#include "core/common/flatbuffers.h" #include "core/common/path.h" #include "core/graph/graph_viewer.h" diff --git a/onnxruntime/core/graph/op_identifier_utils.h b/onnxruntime/core/graph/op_identifier_utils.h index 8a9351a2d0dd..f7b1198c3197 100644 --- a/onnxruntime/core/graph/op_identifier_utils.h +++ b/onnxruntime/core/graph/op_identifier_utils.h @@ -3,7 +3,7 @@ #pragma once -#include "flatbuffers/flatbuffers.h" +#include "core/common/flatbuffers.h" #include "core/graph/op_identifier.h" diff --git a/onnxruntime/core/graph/runtime_optimization_record_container.h b/onnxruntime/core/graph/runtime_optimization_record_container.h index a28b19e786de..75750c2b9698 100644 --- a/onnxruntime/core/graph/runtime_optimization_record_container.h +++ b/onnxruntime/core/graph/runtime_optimization_record_container.h @@ -9,7 +9,7 @@ #include #include -#include "flatbuffers/flatbuffers.h" +#include "core/common/flatbuffers.h" #include "core/common/common.h" #include "core/graph/runtime_optimization_record.h" diff --git a/onnxruntime/core/mickey/README.md b/onnxruntime/core/mickey/README.md index 7e8d30cd1805..735ec4b80daf 100644 --- a/onnxruntime/core/mickey/README.md +++ b/onnxruntime/core/mickey/README.md @@ -4,3 +4,7 @@ Playful name for a template library of high performance cuda code that are often shared by various AI operators. The intention is to make this header files only, with no binary impact unless it is instantiated where it is needed. + +Currently CUDA code is scattered across multiple locations in the repo. +Hopefully this can be the starting point for consolidating all CUDA +code. diff --git a/onnxruntime/core/mickey/blk_q4/f16_gemm_sm80.h b/onnxruntime/core/mickey/blk_q4/f16_gemm_sm80.h new file mode 100644 index 000000000000..52bff7e40dbe --- /dev/null +++ b/onnxruntime/core/mickey/blk_q4/f16_gemm_sm80.h @@ -0,0 +1,208 @@ +/** + * Copyright (c) Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. + * + * Module Name: + * blk_q4/f16_gemm_sm80.h + * + * Abstract: + * Entry point for Q4F16 GEMM kernel for SM80 devices.
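 *
 * Editor's note (illustrative addition, not in the original header): the
 * implementation below is specialized on SmallM, so callers are expected to
 * dispatch on the runtime M dimension, e.g. (hypothetical wrapper):
 *
 *   if (problem_size.m() <= 16)
 *     status = BlkQ4F16GemmImpl<cutlass::half_t, Blocking, true, kHasOffset>::run(stream, problem_size, ...);
 *   else
 *     status = BlkQ4F16GemmImpl<cutlass::half_t, Blocking, false, kHasOffset>::run(stream, problem_size, ...);
 *
 * run() itself returns kErrorNotSupported when the SmallM variant is handed a
 * problem with m() > 16, as shown below.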
+ */ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass_ext/q4gemm/device/quantb_gemm.h" + +namespace onnxruntime { +namespace cuda { + +// +// This is the implementation of the quantized GEMM kernel for 16b float x blocked quantized 4b data type +// +template < + typename ElementDequant_, // <- data type of dequantized elements for gemm, fp16 or bf16 + typename QuantBlocking_, // <- weights block per scale, cutlass::MatrixShape + bool SmallM, // <- true if M <= 16 + bool kHasQuantOffset> +struct BlkQ4F16GemmImpl { + // + // Type definitions + // + + using ElementDequant = ElementDequant_; + using QuantBlocking = QuantBlocking_; + + static_assert(sizeof(ElementDequant) == 2, "q4f16gemm kernel only supports 16b operands!"); + + // Data types that are fixed for this kernel + using ElementAccumulator = float; + using ElementComputeEpilogue = ElementAccumulator; + using ElementInputA = ElementDequant; + using ElementOutput = ElementDequant; + + using ElementW = uint8_t; // <- Weight is int4, uint8 for two of them + + // We pack 4 weights into one 16b element, so as to leverage cutlass tile iterators + // for async shared memory loading and minimize bank conflicts + using ElementWPack = ElementDequant; + + using ElementQScale = ElementDequant; // <- data type of quantization scale + using ElementQOffset = uint8_t; + + using LayoutInputA = cutlass::layout::RowMajor; + using LayoutInputWPack = cutlass::layout::ColumnMajor; + using LayoutOutput = cutlass::layout::RowMajor; + + // Layout of quantization scale and offset, oriented to be loaded using less instructions + // in a warp tile + using LayoutInputQScale = + typename std::conditional<QuantBlocking::kRow == 1, cutlass::layout::ColumnMajor, cutlass::layout::RowMajor>::type; // <- layout of quantization scale + + using ShapeMMAThreadBlock = + typename std::conditional<SmallM, cutlass::gemm::GemmShape<16, 64, 64>, + cutlass::gemm::GemmShape<128, 256, 64>>::type; + + static constexpr int MinN = QuantBlocking::kColumn > 32 ? QuantBlocking::kColumn : 32; + using ShapeMMAWarp = + typename std::conditional<SmallM, cutlass::gemm::GemmShape<16, MinN, 64>, + cutlass::gemm::GemmShape<64, 64, 64>>::type; + + using ShapeMMAOp = cutlass::gemm::GemmShape<16, 8, 16>; + + // This code section describes how threadblocks are scheduled on GPU + using SwizzleThreadBlock = cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>; // <- ?? + + // This code section describes the epilogue part of the kernel + using EpilogueOp = cutlass::epilogue::thread::LinearCombination< + ElementOutput, // <- data type of output matrix + 128 / cutlass::sizeof_bits<ElementOutput>::value, // <- the number of elements per vectorized + // memory access. For a byte, it's 16 + // elements.
This becomes the vector width of + // math instructions in the epilogue too + ElementAccumulator, // <- data type of accumulator + ElementComputeEpilogue>; // <- data type for alpha/beta in linear combination function + + // Number of pipelines you want to use + static constexpr int NumStages = 3; + + using Gemm = cutlass::gemm::device::QuantBGemm< + ElementInputA, + LayoutInputA, + ElementWPack, + LayoutInputWPack, + ElementQScale, + typename std::conditional::type, + LayoutInputQScale, + QuantBlocking, + ElementOutput, + LayoutOutput, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + ShapeMMAThreadBlock, + ShapeMMAWarp, + ShapeMMAOp, + EpilogueOp, + SwizzleThreadBlock, + NumStages>; + + using Arguments = typename Gemm::Arguments; + + // Invoke gemm kernel (the version with quantization offset) + static cutlass::Status run( + cudaStream_t stream, + const cutlass::gemm::GemmCoord& problem_size_, + cutlass::TensorRef ref_A_, + cutlass::TensorRef ref_B_, + cutlass::TensorRef ref_Qscale_, + cutlass::TensorRef ref_Qoffset_, + cutlass::TensorRef ref_C_, + cutlass::TensorRef ref_D_, + typename EpilogueOp::Params epilogue_ = typename EpilogueOp::Params()) { + if constexpr (!kHasQuantOffset) { + return cutlass::Status::kErrorNotSupported; + } else { + if constexpr (ShapeMMAThreadBlock::kM == 16) { + if (problem_size_.m() > 16) { + // For M > 16, the caller should have picked the + // kernel with bigger M + return cutlass::Status::kErrorNotSupported; + } + } + + // Construct Gemm arguments + Arguments args{ + problem_size_, + ref_A_, + ref_B_, + ref_Qscale_, + ref_Qoffset_, + ref_C_, + ref_D_, + epilogue_}; + + Gemm gemm_op; + + // Check if this GEMM can be run or not + cutlass::Status status = gemm_op.can_implement(args); + if (status != cutlass::Status::kSuccess) { + return status; + } + + // Launch the CUTLASS GEMM kernel. + return gemm_op(args, nullptr, stream); + } + } + + // Invoke gemm kernel (the version without quantization offset) + static cutlass::Status run( + cudaStream_t stream, + const cutlass::gemm::GemmCoord& problem_size_, + cutlass::TensorRef ref_A_, + cutlass::TensorRef ref_B_, + cutlass::TensorRef ref_Qscale_, + cutlass::TensorRef ref_C_, + cutlass::TensorRef ref_D_, + typename EpilogueOp::Params epilogue_ = typename EpilogueOp::Params()) { + if constexpr (kHasQuantOffset) { + return cutlass::Status::kErrorNotSupported; + } else { + if constexpr (ShapeMMAThreadBlock::kM == 16) { + if (problem_size_.m() > 16) { + // For M > 16, the caller should have picked the + // kernel with bigger M + return cutlass::Status::kErrorNotSupported; + } + } + + // Construct Gemm arguments + Arguments args{ + problem_size_, + ref_A_, + ref_B_, + ref_Qscale_, + ref_C_, + ref_D_, + epilogue_}; + + Gemm gemm_op; + + // Check if this GEMM can be run or not + cutlass::Status status = gemm_op.can_implement(args); + if (status != cutlass::Status::kSuccess) { + return status; + } + + // Launch the CUTLASS GEMM kernel. + return gemm_op(args, nullptr, stream); + } + } +}; + +} // namespace cuda +} // namespace onnxruntime diff --git a/onnxruntime/core/mickey/blk_q4/prepack_sm80.h b/onnxruntime/core/mickey/blk_q4/f16_prepack_sm80.h similarity index 94% rename from onnxruntime/core/mickey/blk_q4/prepack_sm80.h rename to onnxruntime/core/mickey/blk_q4/f16_prepack_sm80.h index e291ab39e8aa..c81b4967d271 100644 --- a/onnxruntime/core/mickey/blk_q4/prepack_sm80.h +++ b/onnxruntime/core/mickey/blk_q4/f16_prepack_sm80.h @@ -3,7 +3,7 @@ * Licensed under the MIT License. 
* * Module Name: - * prepack_sm80.h + * blk_q4/f16_prepack_sm80.h * * Abstract: * Prepack weights and quantization parameters (scales and offsets) for @@ -110,8 +110,8 @@ struct BlockwiseQuantization { static void prepack_weights( int rows, int columns, - const gsl::span& weights, // <- int4 weights, column major - const gsl::span& weights_prepacked // <- int4 prepacked weights tensor, same size buffer + gsl::span weights, // <- int4 weights, column major + gsl::span weights_prepacked // <- int4 prepacked weights tensor, same size buffer ) { ORT_ENFORCE((rows % 16) == 0 && (columns % 16) == 0 && (rows % QuantBlocking::kRow) == 0 && @@ -171,10 +171,10 @@ struct BlockwiseQuantization { static void prepack_quant_scales( size_t rows, size_t columns, - const gsl::span& scales, // <- quant scales, column major layout - const gsl::span& scales_prepacked // <- quant scales prepacked, same size buffer + gsl::span scales, // <- quant scales, column major layout + gsl::span scales_prepacked // <- quant scales prepacked, same size buffer ) { - auto meta_shape = get_quant_meta_shape(rows, columns); + auto meta_shape = get_quant_meta_shape(static_cast(rows), static_cast(columns)); ORT_ENFORCE(scales.size() == size_t(meta_shape.product()), "Quantization scale tensor shape mismatch!"); ORT_ENFORCE(scales_prepacked.size() == size_t(meta_shape.product()), @@ -241,10 +241,10 @@ struct BlockwiseQuantization { static void prepack_quant_offsets( size_t rows, size_t columns, - const gsl::span& offsets, // <- quant offsets, int4, column major layout - const gsl::span& offsets_prepacked // <- quant offsets prepacked, double size buffer + gsl::span offsets, // <- quant offsets, int4, column major layout + gsl::span offsets_prepacked // <- quant offsets prepacked, double size buffer ) { - auto meta_shape = get_quant_meta_shape(rows, columns); + auto meta_shape = get_quant_meta_shape(static_cast(rows), static_cast(columns)); ORT_ENFORCE((rows % 16) == 0 && (columns % 16) == 0, "Does not support odd number of rows or columns!"); diff --git a/onnxruntime/core/mickey/cutlass_ext/q4gemm/device/quantb_gemm.h b/onnxruntime/core/mickey/cutlass_ext/q4gemm/device/quantb_gemm.h new file mode 100644 index 000000000000..38795291b032 --- /dev/null +++ b/onnxruntime/core/mickey/cutlass_ext/q4gemm/device/quantb_gemm.h @@ -0,0 +1,481 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/** + * Modifications Copyright (c) Microsoft. + * Licensed under the MIT license. + * + * @file quantb_gemm.h + * @brief Modified from cutlass/gemm/device/gemm.h, boilerplate code passing input pointers to the kernel. +*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/numeric_types.h" +#include "cutlass/arch/arch.h" +#include "cutlass/device_kernel.h" + +#include "cutlass/gemm/threadblock/threadblock_swizzle.h" +#include "cutlass/gemm/kernel/gemm.h" + +#include "cutlass_ext/q4gemm/kernel/default_quantb_gemm.h" +#include "cutlass/gemm/device/default_gemm_configuration.h" + +#include "cutlass/layout/permute.h" + +//////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace gemm { +namespace device { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/*! A specialized GEMM operator for quantized B GEMM. + + It is modified from cutlass::gemm::device::Gemm. Both this class and the original Gemm class + are pretty much boilerplate code that construct the Gemm kernel class, and pass parameters + and controls to it. The only difference is that this class has a few more template parameters + to support quantization. + + This implementation pretty much follows the design of cutlass. But this class seems to be + just a wrapper of the Gemm kernel class. Consider combining them in future iterations. 
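 *
 * Editor's sketch of the resulting call pattern (hypothetical names, mirroring
 * the Arguments / can_implement / initialize / run flow defined below):
 *
 *   using Gemm = cutlass::gemm::device::QuantBGemm<...>;  // fully instantiated elsewhere
 *   typename Gemm::Arguments args{problem_size, ref_A, ref_B, ref_Qscale,
 *                                 ref_C, ref_D};          // no-offset overload
 *   Gemm gemm_op;
 *   if (Gemm::can_implement(args) == cutlass::Status::kSuccess) {
 *     gemm_op(args, nullptr, stream);  // workspace = nullptr; initialize + run
 *   }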
+ +*/ +template < + /// Element type for A matrix operand + typename ElementA_, + /// Layout type for A matrix operand + typename LayoutA_, + /// Element type for B matrix operand + typename ElementB_, + /// Layout type for B matrix operand + typename LayoutB_, + /// Element type for quant scales + typename ElementQScale_, + /// Element type for quant offsets + typename ElementQOffset_, + /// Layout type for quant scales and offsets + typename LayoutQMeta_, + /// Blocking dimensions for quantization + typename QuantBlocking_, + /// Element type for C and D matrix operands + typename ElementC_, + /// Layout type for C and D matrix operands + typename LayoutC_, + /// Element type for internal accumulation + typename ElementAccumulator_ = ElementC_, + /// Operator class tag + typename OperatorClass_ = arch::OpClassSimt, + /// Tag indicating architecture to tune for + typename ArchTag_ = arch::Sm80, + /// Threadblock-level tile size (concept: GemmShape) + typename ThreadblockShape_ = typename DefaultGemmConfiguration< + OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, + ElementAccumulator_>::ThreadblockShape, + /// Warp-level tile size (concept: GemmShape) + typename WarpShape_ = typename DefaultGemmConfiguration< + OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, + ElementAccumulator_>::WarpShape, + /// Instruction-level tile size (concept: GemmShape) + typename InstructionShape_ = typename DefaultGemmConfiguration< + OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, + ElementAccumulator_>::InstructionShape, + /// Epilogue output operator + typename EpilogueOutputOp_ = typename DefaultGemmConfiguration< + OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, + ElementAccumulator_>::EpilogueOutputOp, + /// Threadblock-level swizzling operator + typename ThreadblockSwizzle_ = + typename threadblock::GemmIdentityThreadblockSwizzle<>, + /// Number of stages used in the pipelined mainloop + int Stages = + DefaultGemmConfiguration::kStages, + /// Access granularity of A matrix in units of elements + int AlignmentA = + DefaultGemmConfiguration::kAlignmentA, + /// Access granularity of B matrix in units of elements + int AlignmentB = + DefaultGemmConfiguration::kAlignmentB, + /// If true, kernel supports split-K with serial reduction + bool SplitKSerial = false, + /// Operation performed by GEMM + typename Operator_ = typename DefaultGemmConfiguration< + OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, + ElementAccumulator_>::Operator, + /// Gather operand A by using an index array + bool GatherA = false, + /// Gather operand B by using an index array + bool GatherB = false, + /// Scatter result D by using an index array + bool ScatterD = false, + /// Permute result D + typename PermuteDLayout = layout::NoPermute> +class QuantBGemm { + public: + + using ElementA = ElementA_; + using LayoutA = LayoutA_; + using TensorRefA = TensorRef; + using ElementB = ElementB_; + using LayoutB = LayoutB_; + using TensorRefB = TensorRef; + using ElementC = ElementC_; + using LayoutC = LayoutC_; + using TensorRefC = TensorRef; + using TensorRefD = TensorRef; + using ElementAccumulator = ElementAccumulator_; + using OperatorClass = OperatorClass_; + using ArchTag = ArchTag_; + using ThreadblockShape = ThreadblockShape_; + using WarpShape = WarpShape_; + using InstructionShape = InstructionShape_; + using EpilogueOutputOp = EpilogueOutputOp_; + using ThreadblockSwizzle = ThreadblockSwizzle_; + using Operator = Operator_; + static int const kStages = Stages; + static int const 
kAlignmentA = AlignmentA; + static int const kAlignmentB = AlignmentB; + static int const kAlignmentC = EpilogueOutputOp::kCount; + static bool const kSplitKSerial = SplitKSerial; + static ComplexTransform const kTransformA = ComplexTransform::kNone; + static ComplexTransform const kTransformB = ComplexTransform::kNone; + + // Quantization Parameters + static_assert(std::is_same::value, + "LayoutB, i.e. packed weights must appear ColumnMajor."); + static_assert(InstructionShape::kK == 16, + "InstructionShape::kK must be a multiple of 16 (2 tiles), required by 4b weight packing layout."); + using ElementQScale = ElementQScale_; + using ElementQOffset = ElementQOffset_; + using LayoutQMeta = LayoutQMeta_; + using QuantBlocking = QuantBlocking_; + static constexpr bool kHasQOffset = !(std::is_same::value); + + // TODO(chenfucn): consider moving to uint4_t or smaller for QOffset + static_assert(!kHasQOffset || std::is_same::value, "QOffset must be uint8_t"); + + /// Define the kernel + using GemmKernel = typename kernel::DefaultQuantBGemm< + ElementA, + LayoutA, + kAlignmentA, + ElementB, + LayoutB, + kAlignmentB, + ElementQScale, + ElementQOffset, + LayoutQMeta, + QuantBlocking, + ElementC, + LayoutC, + ElementAccumulator, + OperatorClass, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + kStages, + kSplitKSerial, + Operator, + GatherA, + GatherB, + ScatterD, + PermuteDLayout + >::GemmKernel; + + /// Argument structure + struct Arguments { + // + // Data members + // + + GemmCoord problem_size; + TensorRef ref_A; + TensorRef ref_B; + TensorRef ref_C; + TensorRef ref_D; + TensorRef ref_Qscale; + TensorRef ref_Qoffset; + + typename EpilogueOutputOp::Params epilogue; + + // split-K parallelism (etc.) are not yet supported, keeping this for future extension + int split_k_slices{1}; + // For gather+scatter operations + int const *gather_A_indices{nullptr}; + int const *gather_B_indices{nullptr}; + int const *scatter_D_indices{nullptr}; + + // + // Methods + // + + /// Default ctor + CUTLASS_HOST_DEVICE + Arguments(): problem_size(0, 0, 0) {} + + /// Constructs an Arguments structure + CUTLASS_HOST_DEVICE + Arguments( + GemmCoord problem_size_, + TensorRef ref_A_, + TensorRef ref_B_, + TensorRef ref_Qscale_, + TensorRef ref_C_, + TensorRef ref_D_, + typename EpilogueOutputOp::Params epilogue_ = + typename EpilogueOutputOp::Params()): + problem_size(problem_size_), + ref_A(ref_A_), + ref_B(ref_B_), + ref_Qscale(ref_Qscale_), + ref_C(ref_C_), + ref_D(ref_D_), + epilogue(epilogue_) { + assert(!kHasQOffset); + } + + CUTLASS_HOST_DEVICE + Arguments( + GemmCoord problem_size_, + TensorRef ref_A_, + TensorRef ref_B_, + TensorRef ref_Qscale_, + TensorRef ref_Qoffset_, + TensorRef ref_C_, + TensorRef ref_D_, + typename EpilogueOutputOp::Params epilogue_ = + typename EpilogueOutputOp::Params()): + problem_size(problem_size_), + ref_A(ref_A_), + ref_B(ref_B_), + ref_Qscale(ref_Qscale_), + ref_Qoffset(ref_Qoffset_), + ref_C(ref_C_), + ref_D(ref_D_), + epilogue(epilogue_) { + assert(kHasQOffset); + } + }; + + private: + /// Kernel parameters object + typename GemmKernel::Params params_; + + public: + /// Constructs the GEMM. + QuantBGemm() { } + + /// Determines whether the GEMM can execute the given problem. 
+ static Status can_implement(Arguments const &args) { + + if (!kSplitKSerial && args.split_k_slices > 1) { + return Status::kErrorInvalidProblem; + } + + Status status = GemmKernel::can_implement( + args.problem_size, + args.ref_A.non_const_ref(), + args.ref_B.non_const_ref(), + args.ref_Qscale.non_const_ref(), + args.ref_Qoffset.non_const_ref(), + args.ref_C.non_const_ref(), + args.ref_D + ); + + if (status != Status::kSuccess) { + return status; + } + + return Status::kSuccess; + } + + /// Gets the workspace size + static size_t get_workspace_size(Arguments const &args) { + + size_t bytes = 0; + + // Determine grid shape + ThreadblockSwizzle threadblock_swizzle; + + cutlass::gemm::GemmCoord tiled_shape = threadblock_swizzle.get_tiled_shape( + args.problem_size, + {ThreadblockShape::kM, ThreadblockShape::kN, ThreadblockShape::kK}, + args.split_k_slices); + + if (kSplitKSerial && args.split_k_slices > 1) { + + bytes += sizeof(int) * size_t(tiled_shape.m()) * size_t(tiled_shape.n()); + } + + return bytes; + } + + /// Initializes GEMM state from arguments. + Status initialize(Arguments const &args, void *workspace = nullptr, cudaStream_t stream = nullptr) { + + // Determine grid shape + ThreadblockSwizzle threadblock_swizzle; + + cutlass::gemm::GemmCoord grid_shape = threadblock_swizzle.get_tiled_shape( + args.problem_size, + {ThreadblockShape::kM, ThreadblockShape::kN, ThreadblockShape::kK}, + args.split_k_slices); + + if (kSplitKSerial) { + if (args.split_k_slices > 1) { + if (!workspace) { + return Status::kErrorWorkspaceNull; + } + + size_t bytes = get_workspace_size(args); + + cudaError_t result = cudaMemsetAsync(workspace, 0, bytes, stream); + + if (result != cudaSuccess) { + return Status::kErrorInternal; + } + } + } else { + + if (args.split_k_slices > 1) { + return Status::kErrorInvalidProblem; + } + } + + // Initialize the Params structure + params_ = typename GemmKernel::Params{ + args.problem_size, + grid_shape, + args.ref_A.non_const_ref(), + args.ref_B.non_const_ref(), + args.ref_Qscale.non_const_ref(), + args.ref_Qoffset.non_const_ref(), + args.ref_C.non_const_ref(), + args.ref_D, + args.epilogue, + static_cast(workspace), + args.gather_A_indices, + args.gather_B_indices, + args.scatter_D_indices + }; + + return Status::kSuccess; + } + + /// Lightweight update given a subset of arguments + Status update(Arguments const &args, void *workspace = nullptr) { + + if (kSplitKSerial && args.split_k_slices > 1) { + if (!workspace) { + return Status::kErrorWorkspaceNull; + } + } + + params_.ref_A.reset(args.ref_A.non_const_ref().data()); + params_.ref_B.reset(args.ref_B.non_const_ref().data()); + params_.ref_Qscale.reset(args.ref_Qscale.non_const_ref().data()); + params_.ref_Qoffset.reset(args.ref_Qoffset.non_const_ref().data()); + params_.ref_C.reset(args.ref_C.non_const_ref().data()); + params_.ref_D.reset(args.ref_D.data()); + params_.output_op = args.epilogue; + params_.semaphore = static_cast(workspace); + + return Status::kSuccess; + } + + /// Runs the kernel using initialized state. 
+ Status run(cudaStream_t stream = nullptr) { + + ThreadblockSwizzle threadblock_swizzle; + + dim3 grid = threadblock_swizzle.get_grid_shape(params_.grid_tiled_shape); + dim3 block(GemmKernel::kThreadCount, 1, 1); + + cudaError_t result; + + int smem_size = int(sizeof(typename GemmKernel::SharedStorage)); + + if (smem_size >= (48 << 10)) { + result = cudaFuncSetAttribute(Kernel, + cudaFuncAttributeMaxDynamicSharedMemorySize, + smem_size); + + if (result != cudaSuccess) { + std::cerr << "Failed to obtain maximum shared memory size " << smem_size << " for kernel: " + << cudaGetErrorString(result) << "\n"; + return Status::kErrorInternal; + } + } + + cutlass::Kernel<<>>(params_); + + result = cudaGetLastError(); + + return result == cudaSuccess ? Status::kSuccess : Status::kErrorInternal; + } + + /// Runs the kernel using initialized state. + Status operator()(cudaStream_t stream = nullptr) { + return run(stream); + } + + /// Runs the kernel using initialized state. + Status operator()( + Arguments const &args, + void *workspace = nullptr, + cudaStream_t stream = nullptr) { + + Status status = initialize(args, workspace, stream); + + if (status == Status::kSuccess) { + status = run(stream); + } + + return status; + } +}; + + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace device +} // namespace gemm +} // namespace cutlass + +//////////////////////////////////////////////////////////////////////////////// diff --git a/onnxruntime/core/mickey/cutlass_ext/q4gemm/kernel/default_quantb_gemm.h b/onnxruntime/core/mickey/cutlass_ext/q4gemm/kernel/default_quantb_gemm.h new file mode 100644 index 000000000000..2f4460bb59e9 --- /dev/null +++ b/onnxruntime/core/mickey/cutlass_ext/q4gemm/kernel/default_quantb_gemm.h @@ -0,0 +1,255 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + **************************************************************************************************/ + +/** + * Modifications Copyright (c) Microsoft. + * Licensed under the MIT license. + * + * @file default_quantb_gemm.h + * @brief Modified from cutlass/gemm/kernel/default_gemm.h. templates for combining + * threadblock-scoped matrix multiply-add with the appropriate + * threadblock-scoped epilogue. + */ + +#pragma once + +#include "cutlass/cutlass.h" + +#include "cutlass/layout/matrix.h" +#include "cutlass/numeric_types.h" +#include "cutlass/arch/wmma.h" + +#include "cutlass/epilogue/threadblock/epilogue.h" +#include "cutlass/epilogue/thread/linear_combination.h" + +#include "cutlass/gemm/gemm.h" +#include "cutlass_ext/q4gemm/kernel/quantb_gemm.h" +#include "cutlass/gemm/kernel/gemm_pipelined.h" +#include "cutlass/gemm/threadblock/default_mma_core_sm75.h" +#include "cutlass/gemm/threadblock/default_mma_core_sm70.h" +#include "cutlass/gemm/threadblock/default_mma_core_sm80.h" +#include "cutlass_ext/q4gemm/threadblock/default_quantb_mma.h" +#include "cutlass/gemm/threadblock/default_mma_core_simt.h" +#include "cutlass/gemm/threadblock/threadblock_swizzle.h" + +#include "cutlass/epilogue/threadblock/default_epilogue_tensor_op.h" +#include "cutlass/epilogue/threadblock/default_epilogue_volta_tensor_op.h" +#include "cutlass/epilogue/threadblock/default_epilogue_simt.h" +#include "cutlass/transform/threadblock/predicated_tile_iterator.h" + +#include "cutlass/layout/permute.h" + +#if defined(CUTLASS_ARCH_WMMA_ENABLED) +#include "cutlass/epilogue/threadblock/default_epilogue_wmma_tensor_op.h" +#endif //CUTLASS_ARCH_WMMA_ENABLED + +//////////////////////////////////////////////////////////////////////////////// +namespace cutlass { +namespace gemm { +namespace kernel { + +//////////////////////////////////////////////////////////////////////////////// + +template < + /// Element type for A matrix operand + typename ElementA_, + /// Layout type for A matrix operand + typename LayoutA_, + /// Access granularity of A matrix in units of elements + int kAlignmentA, + /// Element type for B matrix operand + typename ElementB_, + /// Layout type for B matrix operand + typename LayoutB_, + /// Access granularity of B matrix in units of elements + int kAlignmentB, + /// Element type for quant scales + typename ElementQScale_, + /// Element type for quant offsets + typename ElementQOffset_, + /// Layout type for quant scales and offsets + typename LayoutQMeta_, + /// Blocking dimensions for quantization + typename QuantBlocking_, + /// Access granularity of quant scales in units of elements + typename ElementC_, + /// Layout type for C and D matrix operands + typename LayoutC_, + /// Element type for internal accumulation + typename ElementAccumulator, + /// Operator class tag + typename OperatorClass, + /// Tag indicating architecture to tune for + typename ArchTag, + /// Threadblock-level tile size (concept: GemmShape) + typename ThreadblockShape, + /// Warp-level tile size (concept: GemmShape) + typename WarpShape, + /// Warp-level tile size (concept: GemmShape) + typename InstructionShape, + /// Epilogue output operator + typename EpilogueOutputOp, + /// Threadblock-level swizzling operator + typename ThreadblockSwizzle, + /// Number of stages used in the pipelined mainloop + int Stages, + /// If true, kernel is configured to support serial reduction in the + /// epilogue + bool SplitKSerial, + /// Operation performed by GEMM + typename Operator, + /// Gather operand A by using an index array + 
+    bool GatherA = false,
+    /// Gather operand B by using an index array
+    bool GatherB = false,
+    /// Scatter result D by using an index array
+    bool ScatterD = false,
+    /// Permute result D
+    typename PermuteDLayout = layout::NoPermute,
+    /// Permute operand A
+    typename PermuteALayout = layout::NoPermute,
+    /// Permute operand B
+    typename PermuteBLayout = layout::NoPermute,
+    ///
+    typename Enable = void
+>
+struct DefaultQuantBGemm;
+
+////////////////////////////////////////////////////////////////////////////////
+
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization for Ampere Architecture
+template <
+    /// Element type for A matrix operand
+    typename ElementA,
+    /// Layout type for A matrix operand
+    typename LayoutA,
+    /// Access granularity of A matrix in units of elements
+    int kAlignmentA,
+    /// Element type for B matrix operand
+    typename ElementB,
+    /// Layout type for B matrix operand
+    typename LayoutB,
+    /// Access granularity of B matrix in units of elements
+    int kAlignmentB,
+    /// Element type for quant scales
+    typename ElementQScale,
+    /// Element type for quant offsets
+    typename ElementQOffset,
+    /// Layout type for quant scales
+    typename LayoutQMeta,
+    /// Blocking dimensions for quantization
+    typename QuantBlocking,
+    /// Element type for C and D matrix operands
+    typename ElementC,
+    /// Layout type for C and D matrix operand
+    typename LayoutC,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Epilogue output operator
+    typename EpilogueOutputOp,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle,
+    /// Number of stages used in the pipelined mainloop
+    int Stages,
+    /// If true, kernel is configured to support serial reduction in the
+    /// epilogue
+    bool SplitKSerial,
+    /// Operation performed by GEMM
+    typename Operator,
+    /// Gather operand A by using an index array
+    bool GatherA,
+    /// Gather operand B by using an index array
+    bool GatherB,
+    /// Scatter result D by using an index array
+    bool ScatterD,
+    /// Permute result D
+    typename PermuteDLayout,
+    /// Permute operand A
+    typename PermuteALayout,
+    /// Permute operand B
+    typename PermuteBLayout
+>
+struct DefaultQuantBGemm<ElementA, LayoutA, kAlignmentA, ElementB, LayoutB, kAlignmentB,
+                         ElementQScale, ElementQOffset, LayoutQMeta, QuantBlocking,
+                         ElementC, LayoutC, ElementAccumulator, arch::OpClassTensorOp,
+                         arch::Sm80, ThreadblockShape, WarpShape, InstructionShape,
+                         EpilogueOutputOp, ThreadblockSwizzle, Stages, SplitKSerial,
+                         Operator, GatherA, GatherB, ScatterD, PermuteDLayout,
+                         PermuteALayout, PermuteBLayout> {
+
+  static_assert((platform::is_same<LayoutC, layout::RowMajor>::value
+                     || platform::is_same<LayoutC, layout::AffineRankN<2>>::value),
+                "Epilogue in the kernel level must be row major");
+
+  /// Define the threadblock-scoped matrix multiply-accumulate
+  using Mma = typename cutlass::gemm::threadblock::DefaultQuantBMma<
+      ElementA, LayoutA, kAlignmentA, ElementB, LayoutB, kAlignmentB,
+      ElementQScale, ElementQOffset, LayoutQMeta, QuantBlocking,
+      ElementAccumulator, LayoutC, arch::OpClassTensorOp, arch::Sm80,
+      ThreadblockShape, WarpShape, InstructionShape, Stages,
+      Operator, false, GatherA, GatherB,
+      PermuteALayout, PermuteBLayout>::ThreadblockMma;
+
+  static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK;
+
+  /// Define the epilogue
+  using RegularEpilogue =
+      typename cutlass::epilogue::threadblock::DefaultEpilogueTensorOp<
+          ThreadblockShape, typename Mma::Operator, kPartitionsK, EpilogueOutputOp,
+          EpilogueOutputOp::kCount, ScatterD, PermuteDLayout>::Epilogue;
+
+  using Affine2Epilogue =
+      typename cutlass::epilogue::threadblock::DefaultEpilogueTensorOpAffineRankN<
+          2, ThreadblockShape, typename Mma::Operator, kPartitionsK, EpilogueOutputOp,
+          EpilogueOutputOp::kCount>::Epilogue;
+
+  // Select the affine-rank-2 epilogue when LayoutC is not plain row major.
+  using Epilogue = typename platform::conditional<
+      platform::is_same<LayoutC, layout::RowMajor>::value,
+      RegularEpilogue,
+      Affine2Epilogue>::type;
+
+  /// Define the kernel-level GEMM operator.
+  using GemmKernel = kernel::QuantBGemm<Mma, Epilogue, ThreadblockSwizzle, SplitKSerial>;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+}  // namespace kernel
+}  // namespace gemm
+}  // namespace cutlass
diff --git a/onnxruntime/core/mickey/cutlass_ext/q4gemm/kernel/quantb_gemm.h b/onnxruntime/core/mickey/cutlass_ext/q4gemm/kernel/quantb_gemm.h
new file mode 100644
index 000000000000..6e5ad8f40614
--- /dev/null
+++ b/onnxruntime/core/mickey/cutlass_ext/q4gemm/kernel/quantb_gemm.h
@@ -0,0 +1,462 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/**
+ * Modifications Copyright (c) Microsoft.
+ * Licensed under the MIT license.
+ *
+ * @file quantb_gemm.h
+ * @brief Modified from cutlass/gemm/kernel/gemm.h.
+ * Template for a pipelined GEMM kernel with optional serial split-K reduction.
+ * Does not compute batching.
+ */
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/matrix_coord.h"
+#include "cutlass/semaphore.h"
+#include "cutlass/arch/arch.h"
+
+#include "cutlass/util/debug.h"
+#include "cutlass/util/device_dump.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace kernel {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  typename Mma_,                 ///!
Threadblock-scoped matrix multiply-accumulate + typename Epilogue_, ///! Epilogue + typename ThreadblockSwizzle_, ///! Threadblock swizzling function + bool SplitKSerial ///! If true, code supporting split-K via serial reduction is enabled. +> +struct QuantBGemm { + + using Mma = Mma_; + using Epilogue = Epilogue_; + using OutputOp = typename Epilogue::OutputOp; + using ThreadblockSwizzle = ThreadblockSwizzle_; + static bool const kSplitKSerial = SplitKSerial; + + static constexpr bool kHasQOffset = Mma::kHasQOffset; + + /// Warp count (concept: GemmShape) + using WarpCount = typename Mma::WarpCount; + static int const kThreadCount = 32 * WarpCount::kCount; + + /// Parameters structure + struct Params { + cutlass::gemm::GemmCoord problem_size; + cutlass::gemm::GemmCoord grid_tiled_shape; + int swizzle_log_tile; + typename Mma::IteratorA::Params params_A; + typename Mma::IteratorA::TensorRef ref_A; + typename Mma::IteratorB::Params params_B; + typename Mma::IteratorB::TensorRef ref_B; + typename Mma::IteratorQScale::Params params_QScale; + typename Mma::IteratorQScale::TensorRef ref_QScale; + typename Mma::IteratorQOffset::Params params_QOffset; + typename Mma::IteratorQOffset::TensorRef ref_QOffset; + typename Epilogue::OutputTileIterator::Params params_C; + typename Epilogue::OutputTileIterator::TensorRef ref_C; + typename Epilogue::OutputTileIterator::Params params_D; + typename Epilogue::OutputTileIterator::TensorRef ref_D; + typename OutputOp::Params output_op; + int *semaphore; + int gemm_k_size; // how many k vectors are processed by this threadblock + // For gather+scatter operations + int const *gather_A_indices; + int const *gather_B_indices; + int const *scatter_D_indices; + + // + // Methods + // + + CUTLASS_HOST_DEVICE + Params(): swizzle_log_tile(0), semaphore(0), gemm_k_size(0) { } + + CUTLASS_HOST_DEVICE + Params( + cutlass::gemm::GemmCoord const & problem_size, + cutlass::gemm::GemmCoord const & grid_tiled_shape, + typename Mma::IteratorA::TensorRef ref_A, + typename Mma::IteratorB::TensorRef ref_B, + typename Mma::IteratorQScale::TensorRef ref_QScale, + typename Mma::IteratorQOffset::TensorRef ref_QOffset, + typename Epilogue::OutputTileIterator::TensorRef ref_C, + typename Epilogue::OutputTileIterator::TensorRef ref_D, + typename OutputOp::Params output_op = typename OutputOp::Params(), + int *workspace = nullptr, + int const *gather_A_indices = nullptr, + int const *gather_B_indices = nullptr, + int const *scatter_D_indices = nullptr + ): + problem_size(problem_size), + grid_tiled_shape(grid_tiled_shape), + swizzle_log_tile(ThreadblockSwizzle().get_log_tile(grid_tiled_shape)), + params_A(ref_A.layout()), + ref_A(ref_A), + params_B(ref_B.layout()), + ref_B(ref_B), + params_QScale(ref_QScale.layout()), + ref_QScale(ref_QScale), + params_QOffset(ref_QOffset.layout()), + ref_QOffset(ref_QOffset), + params_C(ref_C.layout()), + ref_C(ref_C), + params_D(ref_D.layout()), + ref_D(ref_D), + output_op(output_op), + gather_A_indices(gather_A_indices), + gather_B_indices(gather_B_indices), + scatter_D_indices(scatter_D_indices) { + int total_gemm_k_iterations = (problem_size.k() + Mma::Shape::kK - 1) / Mma::Shape::kK; + int gemm_k_iterations = (total_gemm_k_iterations + grid_tiled_shape.k() - 1) / grid_tiled_shape.k(); + + gemm_k_size = gemm_k_iterations * Mma::Shape::kK; + + semaphore = workspace; + } + }; + + /// Shared memory storage structure + union SharedStorage { + typename Mma::SharedStorage main_loop; + typename Epilogue::SharedStorage epilogue; + }; + + // + // Methods + // 
+
+  CUTLASS_HOST_DEVICE
+  QuantBGemm() { }
+
+  /// Determines whether kernel satisfies alignment
+  CUTLASS_HOST_DEVICE
+  static Status can_implement(
+    cutlass::gemm::GemmCoord const & problem_size,
+    typename Mma::IteratorA::TensorRef ref_A,
+    typename Mma::IteratorB::TensorRef ref_B,
+    typename Mma::IteratorQScale::TensorRef ref_QScale,
+    typename Mma::IteratorQOffset::TensorRef ref_QOffset,
+    typename Epilogue::OutputTileIterator::TensorRef ref_C,
+    typename Epilogue::OutputTileIterator::TensorRef ref_D) {
+
+    // TODO check problem_size K, N must be multiple of QuantBlocking
+
+    static int const kAlignmentA = (platform::is_same<typename Mma::IteratorA::Layout,
+                                                      layout::ColumnMajorInterleaved<32>>::value)
+                                   ? 32
+                                   : (platform::is_same<typename Mma::IteratorA::Layout,
+                                                        layout::ColumnMajorInterleaved<64>>::value)
+                                     ? 64
+                                     : Mma::IteratorA::AccessType::kElements;
+    static int const kAlignmentB = (platform::is_same<typename Mma::IteratorB::Layout,
+                                                      layout::RowMajorInterleaved<32>>::value)
+                                   ? 32
+                                   : (platform::is_same<typename Mma::IteratorB::Layout,
+                                                        layout::RowMajorInterleaved<64>>::value)
+                                     ? 64
+                                     : Mma::IteratorB::AccessType::kElements;
+    static int const kAlignmentC = (platform::is_same<typename Epilogue::OutputTileIterator::Layout,
+                                                      layout::ColumnMajorInterleaved<32>>::value)
+                                   ? 32
+                                   : (platform::is_same<typename Epilogue::OutputTileIterator::Layout,
+                                                        layout::ColumnMajorInterleaved<64>>::value)
+                                     ? 64
+                                     : Epilogue::OutputTileIterator::kElementsPerAccess;
+
+    if (!TensorRef_aligned(ref_A, kAlignmentA)) {
+      return Status::kErrorMisalignedOperand;
+    }
+
+    if (!TensorRef_aligned(ref_B, kAlignmentB)) {
+      return Status::kErrorMisalignedOperand;
+    }
+
+    if (problem_size.k() % Mma::Shape::kK != 0) {
+      // Currently we don't support this case due to the way
+      // predicate iterator works, it loads the partial tile
+      // in the first iteration and then the full tile in the
+      // remaining iterations. This will cause the blockwise
+      // quantization parameters to go out of step with the
+      // weights. We can fix this by adding a predicate iterator
+      // that loads the full tile in the first iterations and
+      // then the partial tile in the last iteration.
+      return Status::kErrorInvalidProblem;
+    }
+
+    int qscale_k = problem_size.k() / Mma::QuantBlocking::kRow;
+    int qscale_n = problem_size.n() / Mma::QuantBlocking::kColumn;
+    if ((qscale_k == 0) || (qscale_k * Mma::QuantBlocking::kRow != problem_size.k())) {
+      // partial block not supported
+      return Status::kErrorInvalidProblem;
+    }
+    if ((qscale_n == 0) || (qscale_n * Mma::QuantBlocking::kColumn != problem_size.n())) {
+      // partial block not supported
+      return Status::kErrorInvalidProblem;
+    }
+
+    if (!TensorRef_aligned(ref_QScale, Mma::IteratorQScale::AccessType::kElements)) {
+      return Status::kErrorMisalignedOperand;
+    }
+
+    if constexpr (kHasQOffset) {
+      if (!TensorRef_aligned(ref_QOffset, Mma::IteratorQOffset::AccessType::kElements)) {
+        return Status::kErrorMisalignedOperand;
+      }
+    }
+
+    if (!TensorRef_aligned(ref_C, kAlignmentC)) {
+      return Status::kErrorMisalignedOperand;
+    }
+
+    if (!TensorRef_aligned(ref_D, kAlignmentC)) {
+      return Status::kErrorMisalignedOperand;
+    }
+
+    return Status::kSuccess;
+  }
+
+  /// Executes one GEMM
+  CUTLASS_DEVICE
+  void operator()(Params const &params, SharedStorage &shared_storage) {
+
+    // Compute threadblock location
+    ThreadblockSwizzle threadblock_swizzle;
+
+    cutlass::gemm::GemmCoord threadblock_tile_offset =
+        threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);
+
+    // Early exit if CTA is out of range
+    if (params.grid_tiled_shape.m() <= threadblock_tile_offset.m() ||
+        params.grid_tiled_shape.n() <= threadblock_tile_offset.n()) {
+
+      return;
+    }
+
+    // Compute initial location in logical coordinates
+    cutlass::MatrixCoord tb_offset_A{
+      threadblock_tile_offset.m() * Mma::Shape::kM,
+      threadblock_tile_offset.k() * params.gemm_k_size,
+    };
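+
+    // The quantized B operand is prepacked, so it is addressed in packed
+    // (halved) coordinates; note the matching halved extents passed to
+    // iterator_B below.
+    cutlass::MatrixCoord tb_offset_B{
+      (threadblock_tile_offset.k() * params.gemm_k_size) /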
2, + (threadblock_tile_offset.n() * Mma::Shape::kN) / 2 + }; + + // Problem size is a function of threadblock index in the K dimension + int problem_size_k = min( + params.problem_size.k(), + (threadblock_tile_offset.k() + 1) * params.gemm_k_size); + + // Compute threadblock-scoped matrix multiply-add + int gemm_k_iterations = (problem_size_k - tb_offset_A.column() + Mma::Shape::kK - 1) / Mma::Shape::kK; + + // Compute position within threadblock + int thread_idx = threadIdx.x; + + // Construct iterators to A and B operands + typename Mma::IteratorA iterator_A( + params.params_A, + params.ref_A.data(), + {params.problem_size.m(), problem_size_k}, + thread_idx, + tb_offset_A, + params.gather_A_indices); + + typename Mma::IteratorB iterator_B( + params.params_B, + params.ref_B.data(), + {problem_size_k/2, params.problem_size.n()/2}, + thread_idx, + tb_offset_B, + params.gather_B_indices); + + const int qscale_k = problem_size_k / Mma::QuantBlocking::kRow; + const int qscale_n = params.problem_size.n() / Mma::QuantBlocking::kColumn; + + // should have been verified by can_implement() + assert((qscale_k > 0) && (qscale_k * Mma::QuantBlocking::kRow == problem_size_k)); + assert((qscale_n > 0) && (qscale_n * Mma::QuantBlocking::kColumn == params.problem_size.n())); + + cutlass::MatrixCoord tb_offset_QScale{ + threadblock_tile_offset.k() * (params.gemm_k_size/Mma::QuantBlocking::kRow), + threadblock_tile_offset.n() * (Mma::Shape::kN/Mma::QuantBlocking::kColumn) + }; + + typename Mma::IteratorQScale iterator_QScale( + params.params_QScale, + params.ref_QScale.data(), + {qscale_k, qscale_n}, + thread_idx, + tb_offset_QScale, + nullptr); + + typename Mma::IteratorQOffset iterator_QOffset( + params.params_QOffset, + params.ref_QOffset.data(), + {qscale_k, qscale_n}, + thread_idx, + tb_offset_QScale); + + // Broadcast the warp_id computed by lane 0 to ensure dependent code + // is compiled as warp-uniform. + const int warp_idx = canonical_warp_idx(); + const int lane_idx = threadIdx.x % 32; + + // + // Main loop + // + + // Construct thread-scoped matrix multiply + Mma mma(shared_storage.main_loop, thread_idx, warp_idx, lane_idx); + + typename Mma::FragmentC accumulators; + + accumulators.clear(); + + if (!kSplitKSerial || gemm_k_iterations > 0) { + // Compute threadblock-scoped matrix multiply-add + mma(gemm_k_iterations, accumulators, iterator_A, iterator_B, iterator_QScale, iterator_QOffset, accumulators); + } + + // + // Epilogue + // + + OutputOp output_op(params.output_op); + + // + // Masked tile iterators constructed from members + // + + threadblock_tile_offset = + threadblock_swizzle.get_tile_offset(params.swizzle_log_tile); + + //assume identity swizzle + MatrixCoord threadblock_offset( + threadblock_tile_offset.m() * Mma::Shape::kM, + threadblock_tile_offset.n() * Mma::Shape::kN + ); + + int block_idx = threadblock_tile_offset.m() + threadblock_tile_offset.n() * params.grid_tiled_shape.m(); + + // Construct the semaphore. + Semaphore semaphore(params.semaphore + block_idx, thread_idx); + + // If performing a reduction via split-K, fetch the initial synchronization + if (kSplitKSerial && params.grid_tiled_shape.k() > 1) { + + // Fetch the synchronization lock initially but do not block. + semaphore.fetch(); + + // Indicate which position in a serial reduction the output operator is currently updating + output_op.set_k_partition(threadblock_tile_offset.k(), params.grid_tiled_shape.k()); + } + + // Tile iterator loading from source tensor. 
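+    // (Under serial split-K this source is redirected to the 'D' tensor below,
+    // so each k-partition accumulates on top of the previous partition's output.)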
+ typename Epilogue::OutputTileIterator iterator_C( + params.params_C, + params.ref_C.data(), + params.problem_size.mn(), + thread_idx, + threadblock_offset, + params.scatter_D_indices + ); + + // Tile iterator writing to destination tensor. + typename Epilogue::OutputTileIterator iterator_D( + params.params_D, + params.ref_D.data(), + params.problem_size.mn(), + thread_idx, + threadblock_offset, + params.scatter_D_indices + ); + + Epilogue epilogue( + shared_storage.epilogue, + thread_idx, + warp_idx, + lane_idx); + + // Wait on the semaphore - this latency may have been covered by iterator construction + if (kSplitKSerial && params.grid_tiled_shape.k() > 1) { + + // For subsequent threadblocks, the source matrix is held in the 'D' tensor. + if (threadblock_tile_offset.k()) { + iterator_C = iterator_D; + } + + semaphore.wait(threadblock_tile_offset.k()); + + } + + // Execute the epilogue operator to update the destination tensor. + epilogue(output_op, iterator_D, accumulators, iterator_C); + + // + // Release the semaphore + // + + if (kSplitKSerial && params.grid_tiled_shape.k() > 1) { + + int lock = 0; + if (params.grid_tiled_shape.k() == threadblock_tile_offset.k() + 1) { + + // The final threadblock resets the semaphore for subsequent grids. + lock = 0; + } + else { + // Otherwise, the semaphore is incremented + lock = threadblock_tile_offset.k() + 1; + } + + semaphore.release(lock); + } + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace kernel +} // namespace gemm +} // namespace cutlass diff --git a/onnxruntime/core/mickey/cutlass_ext/q4gemm/threadblock/default_quantb_mma.h b/onnxruntime/core/mickey/cutlass_ext/q4gemm/threadblock/default_quantb_mma.h new file mode 100644 index 000000000000..0af604f090e1 --- /dev/null +++ b/onnxruntime/core/mickey/cutlass_ext/q4gemm/threadblock/default_quantb_mma.h @@ -0,0 +1,248 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/**
+ * Modifications Copyright (c) Microsoft.
+ * Licensed under the MIT license.
+ *
+ * @file default_quantb_mma.h
+ * @brief Modified from cutlass/gemm/threadblock/default_mma.h.
+ * Defining global memory data layout and iterators, combining with mma core and
+ * pipelined GEMM kernel.
+ */
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/arch/arch.h"
+#include "cutlass/arch/wmma.h"
+
+#include "cutlass/layout/matrix.h"
+#include "cutlass/layout/permute.h"
+#include "cutlass/transform/threadblock/predicated_tile_iterator.h"
+#include "cutlass/transform/threadblock/predicated_tile_iterator_2dthreadtile.h"
+#include "cutlass_ext/q4gemm/threadblock/optional_predicated_tile_access_iter.h"
+
+#include "cutlass/gemm/gemm.h"
+#include "cutlass_ext/q4gemm/threadblock/default_quantb_mma_core.h"
+#include "cutlass_ext/q4gemm/threadblock/quantb_mma_multistage.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace threadblock {
+
+////////////////////////////////////////////////////////////////////////////////
+
+template <
+    /// Element type for A matrix operand
+    typename ElementA_,
+    /// Layout type for A matrix operand
+    typename LayoutA_,
+    /// Access granularity of A matrix in units of elements
+    int kAlignmentA,
+    /// Element type for B matrix operand
+    typename ElementB_,
+    /// Layout type for B matrix operand
+    typename LayoutB_,
+    /// Access granularity of B matrix in units of elements
+    int kAlignmentB,
+    /// Element type for quant scales
+    typename ElementQScale_,
+    /// Element type for quant offsets
+    typename ElementQOffset_,
+    /// Layout for quant scales and offsets
+    typename LayoutQMeta_,
+    /// Blocking size for quantization
+    typename QuantBlocking_,
+    /// Element type for internal accumulation
+    typename ElementAccumulator_,
+    /// Layout type for C and D matrix operands
+    typename LayoutC_,
+    /// Operator class tag
+    typename OperatorClass_,
+    /// Tag indicating architecture to tune for
+    typename ArchTag_,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape_,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape_,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape_,
+    /// Number of stages used in the pipelined mainloop
+    int Stages,
+    /// Operation performed by GEMM
+    typename Operator,
+    /// Store the accumulators in row major or column major. Row major is used
+    /// when output layout is interleaved.
+    bool AccumulatorsInRowMajor = false,
+    /// Gather operand A by using an index array
+    bool GatherA = false,
+    /// Gather operand B by using an index array
+    bool GatherB = false,
+    /// Permute operand A
+    typename PermuteALayout = layout::NoPermute,
+    /// Permute operand B
+    typename PermuteBLayout = layout::NoPermute
+    >
+struct DefaultQuantBMma;
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization for row-major output (OperatorClass TensorOp)
+template <
+    /// Element type for A matrix operand
+    typename ElementA,
+    /// Layout type for A matrix operand
+    typename LayoutA,
+    /// Access granularity of A matrix in units of elements
+    int kAlignmentA,
+    /// Element type for B matrix operand
+    typename ElementB,
+    /// Layout type for B matrix operand
+    typename LayoutB,
+    /// Access granularity of B matrix in units of elements
+    int kAlignmentB,
+    /// Element type for quant scales
+    typename ElementQScale,
+    /// Element type for quant offsets
+    typename ElementQOffset,
+    /// Layout for quant scales and offsets
+    typename LayoutQMeta,
+    /// Blocking size for quantization
+    typename QuantBlocking,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Layout type for C and D matrix operand
+    typename LayoutC,
+    /// Tag indicating architecture to tune for
+    typename ArchTag,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Number of stages used in the multistage mainloop
+    int Stages,
+    /// Operation performed by GEMM
+    typename Operator,
+    /// Gather operand A by using an index array
+    bool GatherA,
+    /// Gather operand B by using an index array
+    bool GatherB,
+    /// Permute operand A
+    typename PermuteALayout,
+    /// Permute operand B
+    typename PermuteBLayout
+    >
+struct DefaultQuantBMma<ElementA, LayoutA, kAlignmentA, ElementB, LayoutB, kAlignmentB,
+                        ElementQScale, ElementQOffset, LayoutQMeta, QuantBlocking,
+                        ElementAccumulator, LayoutC, arch::OpClassTensorOp, ArchTag,
+                        ThreadblockShape, WarpShape, InstructionShape, Stages,
+                        Operator, false, GatherA, GatherB,
+                        PermuteALayout, PermuteBLayout> {
+
+  static_assert(platform::is_same<LayoutC, layout::RowMajor>::value
+                    || platform::is_same<LayoutC, layout::AffineRankN<2>>::value,
+                "simt epilogue must be row major");
+
+  static cutlass::arch::CacheOperation::Kind const CacheOpA =
+      ((sizeof_bits<ElementA>::value * kAlignmentA) == 128)
+          ? cutlass::arch::CacheOperation::Global
+          : cutlass::arch::CacheOperation::Always;
+
+  static cutlass::arch::CacheOperation::Kind const CacheOpB =
+      ((sizeof_bits<ElementB>::value * kAlignmentB) == 128)
+          ? cutlass::arch::CacheOperation::Global
+          : cutlass::arch::CacheOperation::Always;
+
+  // Define the MmaCore components
+  using MmaCore = typename cutlass::gemm::threadblock::DefaultQuantBMmaCore<
+      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
+      ElementB, LayoutB, ElementQScale, ElementQOffset, LayoutQMeta, QuantBlocking,
+      ElementAccumulator, LayoutC, arch::OpClassTensorOp,
+      Stages, Operator, false, CacheOpA, CacheOpB>;
+
+  // Define iterators over tiles from the A operand
+  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
+  using AccessTypeA = cutlass::Array<ElementA, kAlignmentA>;
+  using IteratorA =
+      cutlass::transform::threadblock::PredicatedTileAccessIterator<
+          cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
+          ElementA, LayoutA, 1, ThreadMapA, AccessTypeA, GatherA, PermuteALayout>;
+
+  // Define iterators over tiles from the B operand
+  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
+  using AccessTypeB = cutlass::Array<ElementB, kAlignmentB>;
+  using IteratorB =
+      cutlass::transform::threadblock::PredicatedTileAccessIterator<
+          cutlass::MatrixShape<ThreadblockShape::kK / 2, ThreadblockShape::kN / 2>,
+          ElementB, LayoutB, 0, ThreadMapB, AccessTypeB, GatherB, PermuteBLayout>;
+
+  // Define iterators over tiles from the quant scales
+  using ThreadMapQScale = typename MmaCore::IteratorThreadMapQScale;
+  using AccessTypeQScale =
+      cutlass::Array<ElementQScale, ThreadMapQScale::kElementsPerAccess>;
+  using IteratorQScale =
+      cutlass::transform::threadblock::PredicatedTileAccessIterator<
+          typename MmaCore::ThreadblockQShape,
+          ElementQScale, LayoutQMeta, 0, ThreadMapQScale, AccessTypeQScale>;
+
+  using ThreadMapQOffset = typename MmaCore::IteratorThreadMapQOffset;
+  using AccessTypeQOffset =
+      cutlass::Array<ElementQOffset, ThreadMapQOffset::kElementsPerAccess>;
+  using IteratorQOffset =
+      cutlass::transform::threadblock::OptionalPredicatedTileAccessIterator<
+          typename MmaCore::ThreadblockQShape, ElementQOffset, LayoutQMeta,
+          0, ThreadMapQOffset, AccessTypeQOffset, MmaCore::kThreads>;
+
+  // Define the threadblock-scoped multistage matrix multiply
+  using ThreadblockMma = cutlass::gemm::threadblock::QuantBMmaMultistage<
+      typename MmaCore::Shape, IteratorA, typename MmaCore::SmemIteratorA,
+      MmaCore::kCacheOpA, IteratorB, typename MmaCore::SmemIteratorB,
+      MmaCore::kCacheOpB, IteratorQScale, typename MmaCore::SmemIteratorQScale,
+      cutlass::arch::CacheOperation::Global, IteratorQOffset,
+      typename MmaCore::SmemIteratorQOffset, cutlass::arch::CacheOperation::Global,
+      ElementAccumulator, LayoutC,
+      typename MmaCore::MmaPolicy, Stages>;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+}  // namespace threadblock
+}  // namespace gemm
+}  // namespace cutlass
+
+////////////////////////////////////////////////////////////////////////////////
diff --git a/onnxruntime/core/mickey/cutlass_ext/q4gemm/threadblock/default_quantb_mma_core.h b/onnxruntime/core/mickey/cutlass_ext/q4gemm/threadblock/default_quantb_mma_core.h
new file mode 100644
index 000000000000..ad322f650520
--- /dev/null
+++ b/onnxruntime/core/mickey/cutlass_ext/q4gemm/threadblock/default_quantb_mma_core.h
@@ -0,0 +1,340 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/**
+ * Modifications Copyright (c) Microsoft.
+ * Licensed under the MIT license.
+ *
+ * @file default_quantb_mma_core.h
+ * @brief Modified from cutlass/gemm/threadblock/default_mma_core.h.
+ * Defining data layout in shared memory, and its iterators.
+ */
+
+#pragma once
+
+#include "cutlass/array.h"
+#include "cutlass/cutlass.h"
+
+#include "cutlass/layout/tensor_op_multiplicand_sm75.h"
+#include "cutlass/layout/tensor_op_multiplicand_sm80.h"
+
+#include "cutlass/gemm/warp/mma_simt_policy.h"
+#include "cutlass/gemm/warp/mma_simt.h"
+#include "cutlass_ext/q4gemm/warp/default_quantb_mma_tensor_op.h"
+#include "cutlass/gemm/warp/mma_tensor_op_tile_iterator_sm80.h"
+
+#include "cutlass/gemm/threadblock/default_multistage_mma_complex_core.h"
+#include "cutlass/gemm/threadblock/default_multistage_mma_complex_core_sm80.h"
+
+#include "cutlass/matrix_shape.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/transform/pitch_linear_thread_map.h"
+#include "cutlass/transform/threadblock/regular_tile_access_iterator_tensor_op.h"
+#include "cutlass/transform/threadblock/regular_tile_access_iterator_tensor_op_sm80.h"
+#include "cutlass/transform/threadblock/regular_tile_access_iterator_pitch_linear.h"
+#include "cutlass_ext/q4gemm/threadblock/optional_regular_tile_access_iter.h"
+
+#include "cutlass/util/debug.h"
+#include "cutlass/util/device_dump.h"
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace threadblock {
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Template defining default matrix multiply operators inferred from threadblock tile size,
+/// global memory data layout, and target math instruction.
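+///
+/// For illustration, a plausible instantiation (parameter values here are
+/// hypothetical, chosen only to show the shape of the interface) could be:
+///
+///   using Core = cutlass::gemm::threadblock::DefaultQuantBMmaCore<
+///       cutlass::gemm::GemmShape<64, 64, 64>,        // threadblock tile
+///       cutlass::gemm::GemmShape<32, 32, 64>,        // warp tile
+///       cutlass::gemm::GemmShape<16, 8, 16>,         // tensor core instruction
+///       cutlass::half_t, cutlass::layout::RowMajor,  // A: fp16 activations
+///       uint8_t, cutlass::layout::ColumnMajor,       // B: packed 4-bit weights
+///       cutlass::half_t,                             // quant scale element
+///       uint8_t,                                     // quant offset element
+///       cutlass::layout::RowMajor,                   // scale/offset layout
+///       cutlass::MatrixShape<32, 1>,                 // 32x1 (column-wise) quant blocks
+///       float, cutlass::layout::RowMajor,            // accumulator
+///       cutlass::arch::OpClassTensorOp,
+///       3,                                           // pipeline stages
+///       cutlass::arch::OpMultiplyAdd>;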
+template <
+    /// Shape of threadblock-scoped matrix multiply operator
+    typename Shape,
+    /// Shape of warp-level matrix multiply operator
+    typename WarpShape,
+    /// Shape of one matrix production operation (concept: GemmShape)
+    typename InstructionShape,
+    /// Element data type of A operand
+    typename ElementA,
+    /// Layout of operand A
+    typename LayoutA,
+    /// Element data type of B operand
+    typename ElementB,
+    /// Layout of operand B
+    typename LayoutB,
+    /// Element data type of quant scale
+    typename ElementQScale,
+    /// Element data type of quant offset
+    typename ElementQOffset,
+    /// Layout of quant scale
+    typename LayoutQMeta,
+    /// Blocking dimensions for quantization
+    typename QuantBlocking,
+    /// Data type of accumulator
+    typename ElementC,
+    /// Layout of accumulator
+    typename LayoutC,
+    /// Indicates type of math operator (arch::OpClassSimt or arch::OpClassTensorOp)
+    typename OperatorClass,
+    /// Number of stages
+    int Stages = 2,
+    /// Operation performed by MMA
+    typename Operator = typename platform::conditional<
+        (platform::is_same<OperatorClass,
+                           cutlass::arch::OpClassTensorOp>::value) &&
+            (platform::is_same<ElementA, int8_t>::value ||
+             platform::is_same<ElementA, int4b_t>::value ||
+             platform::is_same<ElementA, uint8_t>::value ||
+             platform::is_same<ElementA, uint4b_t>::value),
+        cutlass::arch::OpMultiplyAddSaturate,
+        cutlass::arch::OpMultiplyAdd>::type,
+    /// Store the accumulators in row major or column major. Row major is used
+    /// when output layout is interleaved.
+    bool AccumulatorsInRowMajor = false,
+    /// Cache operation of operand A
+    cutlass::arch::CacheOperation::Kind CacheOpA =
+        cutlass::arch::CacheOperation::Global,
+    /// Cache operation of operand B
+    cutlass::arch::CacheOperation::Kind CacheOpB =
+        cutlass::arch::CacheOperation::Global,
+    /// per-element transformation for elements of A
+    ComplexTransform TransformA = ComplexTransform::kNone,
+    /// per-element transformation for elements of B
+    ComplexTransform TransformB = ComplexTransform::kNone,
+    bool IsComplex = false  // (is_complex<ElementA>::value || is_complex<ElementB>::value)
+>
+struct DefaultQuantBMmaCore;
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization:
+///
+///   A: row-major
+///   B: column-major
+///   Operator: tensor op class
+///
+/// This uses the default warp-level operator given tile sizes
+template <
+    /// Shape of threadblock-scoped matrix multiply operator (concept:
+    /// GemmShape)
+    typename Shape_,
+    /// Shape of warp-level matrix multiply operator (concept: GemmShape)
+    typename WarpShape_,
+    /// Shape of one matrix production operation (concept: GemmShape)
+    typename InstructionShape_,
+    /// Data type of A operand
+    typename ElementA_,
+    /// Data type of B operand
+    typename ElementB_,
+    /// Element data type of quant scale
+    typename ElementQScale_,
+    /// Element data type of quant offset
+    typename ElementQOffset_,
+    /// Layout of quant scale
+    typename LayoutQMeta_,
+    /// Blocking dimensions for quantization
+    typename QuantBlocking_,
+    /// Data type of accumulator
+    typename ElementC_,
+    /// Layout of accumulator
+    typename LayoutC_,
+    /// Number of stages
+    int Stages,
+    /// Operation performed by MMA
+    typename Operator_,
+    /// Cache operation of operand A
+    cutlass::arch::CacheOperation::Kind CacheOpA,
+    /// Cache operation of operand B
+    cutlass::arch::CacheOperation::Kind CacheOpB>
+struct DefaultQuantBMmaCore<Shape_, WarpShape_, InstructionShape_,
+                            ElementA_, layout::RowMajor,
+                            ElementB_, layout::ColumnMajor,
+                            ElementQScale_, ElementQOffset_, LayoutQMeta_, QuantBlocking_,
+                            ElementC_, LayoutC_, arch::OpClassTensorOp,
+                            Stages, Operator_, false, CacheOpA, CacheOpB> {
+  using Shape = Shape_;
+  using WarpShape = WarpShape_;
+  using InstructionShape = InstructionShape_;
+  using ElementA = ElementA_;
+  using LayoutA = layout::RowMajor;
+  using ElementB = ElementB_;
+  using LayoutB = layout::ColumnMajor;
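+
+  // This specialization covers the layout pair the quantized kernel is written
+  // for: row-major activations (A) against column-major prepacked weights (B).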
+
+  using ElementQScale = ElementQScale_;
+  using ElementQOffset = ElementQOffset_;
+  using LayoutQMeta = LayoutQMeta_;
+  using QuantBlocking = QuantBlocking_;
+
+  using ElementC = ElementC_;
+  using LayoutC = LayoutC_;
+  static int const kStages = Stages;
+  static cutlass::arch::CacheOperation::Kind const kCacheOpA = CacheOpA;
+  static cutlass::arch::CacheOperation::Kind const kCacheOpB = CacheOpB;
+
+  /// Number of warps present
+  using WarpCount = GemmShape<Shape::kM / WarpShape::kM,
+                              Shape::kN / WarpShape::kN,
+                              Shape::kK / WarpShape::kK>;
+
+  // Divisibility requirements
+  static_assert(
+      !(Shape::kM % WarpShape::kM) && !(Shape::kN % WarpShape::kN),
+      "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size.");
+
+  /// Number of threads per warp
+  static int const kWarpSize = warp::WarpSize<arch::OpClassTensorOp>::value;
+
+  /// Number of threads total
+  static int const kThreads = WarpCount::kCount * kWarpSize;
+
+  /// Size of a threadblock-scoped access
+  static int const kAccessSizeInBits = 128;
+
+  /// Default Operator
+  using Operator = Operator_;
+
+  // Warp thread arrangement
+  static int const kWarpThreadArrangementContiguousA =
+      Shape::kK / (kAccessSizeInBits / sizeof_bits<ElementA>::value);
+
+  static int const kWarpThreadArrangementStridedA =
+      kWarpSize / kWarpThreadArrangementContiguousA;
+
+  static int const kWarpThreadArrangementContiguousB =
+      (Shape::kK / 2) / (kAccessSizeInBits / sizeof_bits<ElementB>::value);
+
+  static int const kWarpThreadArrangementStridedB =
+      kWarpSize / kWarpThreadArrangementContiguousB;
+
+  //
+  // Shared memory layouts
+  //
+
+  using SmemLayoutA = layout::RowMajorTensorOpMultiplicandCrosswise<
+      sizeof_bits<ElementA>::value, Shape::kK>;
+
+  using SmemLayoutB = layout::ColumnMajorTensorOpMultiplicandCrosswise<
+      sizeof_bits<ElementB>::value, Shape::kK / 2>;
+
+  //
+  // Iterators to write to shared memory
+  //
+
+  /// ThreadMap of iterator A
+  using IteratorThreadMapA = transform::PitchLinearWarpRakedThreadMap<
+      layout::PitchLinearShape<Shape::kK, Shape::kM>, kThreads,
+      layout::PitchLinearShape<kWarpThreadArrangementContiguousA,
+                               kWarpThreadArrangementStridedA>,
+      kAccessSizeInBits / sizeof_bits<ElementA>::value>;
+
+  /// Shared memory iterator to A operand
+  using SmemIteratorA = transform::threadblock::RegularTileAccessIterator<
+      MatrixShape<Shape::kM, Shape::kK>, ElementA, SmemLayoutA, 0,
+      IteratorThreadMapA>;
+
+  /// ThreadMap of iterator B
+  using IteratorThreadMapB = transform::PitchLinearWarpRakedThreadMap<
+      layout::PitchLinearShape<Shape::kK / 2, Shape::kN / 2>, kThreads,
+      layout::PitchLinearShape<kWarpThreadArrangementContiguousB,
+                               kWarpThreadArrangementStridedB>,
+      kAccessSizeInBits / sizeof_bits<ElementB>::value>;
+
+  /// Shared memory iterator to B operand
+  using SmemIteratorB = transform::threadblock::RegularTileAccessIterator<
+      MatrixShape<Shape::kK / 2, Shape::kN / 2>, ElementB, SmemLayoutB, 1,
+      IteratorThreadMapB>;
+
+  using SmemLayoutQScale = LayoutQMeta;
+  using SmemLayoutQOffset = LayoutQMeta;
+
+  /// Threadblock-level quantization meta data shape
+  using ThreadblockQShape = MatrixShape<Shape::kK / QuantBlocking::kRow,
+                                        Shape::kN / QuantBlocking::kColumn>;
+  static_assert(Shape::kK % QuantBlocking::kRow == 0, "K must be multiple of QuantBlocking::kRow");
+  static_assert(Shape::kN % QuantBlocking::kColumn == 0, "N must be multiple of QuantBlocking::kColumn");
+  static_assert(ThreadblockQShape::kCount > 0, "QuantBlocking too big to fit in a thread block!");
+  static_assert(QuantBlocking::kRow == 1 || QuantBlocking::kColumn == 1,
+                "Only support single column or row quantize blocking!");
+  static_assert(QuantBlocking::kColumn != 1 || std::is_same<LayoutQMeta, layout::RowMajor>::value,
+                "Quant scale matrix's major dimension must have more elements, to facilitate fast loading!");
+
+  /// Threadblock-level quantization meta data shape in pitch-linear layout
+  using TBQPitchLinearShape = typename std::conditional<
+      std::is_same<LayoutQMeta, layout::RowMajor>::value,
+      layout::PitchLinearShape<ThreadblockQShape::kColumn, ThreadblockQShape::kRow>,
+      layout::PitchLinearShape<ThreadblockQShape::kRow, ThreadblockQShape::kColumn>>::type;
+
+  /// By default we would like to use 128b load. However, we can't load more than
+  /// a column at a time in a column major layout.
+  static int const kElementsPerAccessQScale =
+      (kAccessSizeInBits / sizeof_bits<ElementQScale>::value) > TBQPitchLinearShape::kContiguous
+          ? TBQPitchLinearShape::kContiguous
+          : (kAccessSizeInBits / sizeof_bits<ElementQScale>::value);
+
+  /// quant scale is tiny. Not all threads are needed.
+  static int const kAccessCntQScale = ThreadblockQShape::kCount / kElementsPerAccessQScale;
+  static int const kThreadsQScale = (kAccessCntQScale > kThreads) ? kThreads : kAccessCntQScale;
+
+  using IteratorThreadMapQScale = transform::PitchLinearStripminedThreadMap<
+      TBQPitchLinearShape, kThreadsQScale, kElementsPerAccessQScale>;
+
+  using SmemIteratorQScale = transform::threadblock::RegularTileAccessIterator<
+      ThreadblockQShape, ElementQScale, SmemLayoutQScale, 1, IteratorThreadMapQScale>;
+
+  static int const kElementsPerAccessQOffset =
+      (kAccessSizeInBits / sizeof_bits<ElementQOffset>::value) > TBQPitchLinearShape::kContiguous
+          ? TBQPitchLinearShape::kContiguous
+          : (kAccessSizeInBits / sizeof_bits<ElementQOffset>::value);
+  static int const kAccessCntQOffset = ThreadblockQShape::kCount / kElementsPerAccessQOffset;
+  static int const kThreadsQOffset = (kAccessCntQOffset > kThreads) ? kThreads : kAccessCntQOffset;
+
+  using IteratorThreadMapQOffset = transform::PitchLinearStripminedThreadMap<
+      TBQPitchLinearShape, kThreadsQOffset, kElementsPerAccessQOffset>;
+
+  using SmemIteratorQOffset = transform::threadblock::OptionalRegularTileAccessIterator<
+      ThreadblockQShape, ElementQOffset, SmemLayoutQOffset, 1, IteratorThreadMapQOffset, kThreads>;
+
+  //
+  // Warp-level matrix multiply operator
+  //
+
+  // Define the warp-level tensor op
+  using MmaTensorOp = typename cutlass::gemm::warp::DefaultQuantBMmaTensorOp<
+      WarpShape, InstructionShape, ElementA, SmemLayoutA, ElementB, SmemLayoutB,
+      ElementQScale, SmemLayoutQScale, ElementQOffset, SmemLayoutQScale, QuantBlocking,
+      ElementC, LayoutC, Operator, WarpCount::kK>::Type;
+
+  /// Policy used to define MmaPipelined
+  using MmaPolicy = MmaPolicy<MmaTensorOp, MatrixShape<0, 0>,
+                              MatrixShape<0, 0>, WarpCount::kK>;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+}  // namespace threadblock
+}  // namespace gemm
+}  // namespace cutlass
diff --git a/onnxruntime/core/mickey/cutlass_ext/q4gemm/threadblock/optional_predicated_tile_access_iter.h b/onnxruntime/core/mickey/cutlass_ext/q4gemm/threadblock/optional_predicated_tile_access_iter.h
new file mode 100644
index 000000000000..6f27a692a3a2
--- /dev/null
+++ b/onnxruntime/core/mickey/cutlass_ext/q4gemm/threadblock/optional_predicated_tile_access_iter.h
@@ -0,0 +1,314 @@
+/**
+ * Copyright (c) Microsoft Corporation. All rights reserved.
+ * Licensed under the MIT license.
+ *
+ * @file optional_predicated_tile_access_iter.h
+ * @brief Templates for loading and storing optional tiles of matrix data.
+ * This iterator is just a wrapper of PredicatedTileAccessIterator, with
+ * the option to turn it off at compile time and minimize its runtime
+ * footprint. Also, it utilizes the higher numbered threads in the
+ * threadblock when the iterator can not utilize all the threads.
+ */
+
+#pragma once
+
+#include <variant>
+
+#include "cutlass/transform/threadblock/predicated_tile_access_iterator.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace transform {
+namespace threadblock {
+
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Optional 2-D matrix data loader, when element is std::monostate, the
+/// iterator becomes no-op with minimal runtime footprint. Also, it utilizes the
+/// higher numbered threads in the threadblock when the iterator can not utilize
+/// all the threads.
+///
+template <
+    /// Tile shape of the iterator
+    typename Shape_,
+    /// Element data type of the iterator, no-op when it is std::monostate
+    typename Element_,
+    /// Layout of the source matrix
+    typename Layout_,
+    int AdvanceRank_,
+    typename ThreadMap_,
+    typename AccessType_,
+    /// Number of threads in the threadblock, when provided, the iterator
+    /// will utilize the higher numbered threads
+    int kThreadBlockSize_ = -1>
+class OptionalPredicatedTileAccessIterator {
+ public:
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = Layout_;
+  using ThreadMap = ThreadMap_;
+  using AccessType = AccessType_;
+
+  static constexpr int kAdvanceRank = AdvanceRank_;
+  static constexpr int kThreadblockSize = kThreadBlockSize_;
+
+  static_assert(!std::is_same<Element, std::monostate>::value,
+                "Disabled Iterator failed to match the specialized version below.");
+  static_assert(kThreadblockSize == -1 || kThreadblockSize >= ThreadMap::kThreads,
+                "kThreadblockSize must be no smaller than ThreadMap::kThreads");
+
+  using Base = PredicatedTileAccessIterator<Shape, Element, Layout,
+                                            kAdvanceRank, ThreadMap, AccessType>;
+
+  using LongIndex = typename Base::LongIndex;
+  using Mask = typename Base::Mask;
+  using TensorCoord = typename Base::TensorCoord;
+  using TensorRef = typename Base::TensorRef;
+  using Params = typename Base::Params;
+  using Pointer = typename Base::Pointer;
+
+  static constexpr int kAccessesPerVector = Base::kAccessesPerVector;
+
+  CUTLASS_HOST_DEVICE
+  static int flip_thread_id(int thread_id) {
+    if constexpr (kThreadblockSize > 0) {
+      return kThreadblockSize - 1 - thread_id;
+    }
+    return thread_id;
+  }
+
+ public:
+  Base base_;
+
+  /// Default constructor
+  OptionalPredicatedTileAccessIterator() : base_() {}
+
+  /// Constructs a TileIterator from its precomputed state, threadblock offset,
+  /// and thread ID
+  CUTLASS_HOST_DEVICE
+  OptionalPredicatedTileAccessIterator(
+      /// Precomputed parameters object
+      Params const &params,
+      /// Pointer to start of tensor
+      Pointer pointer,
+      /// Extent of tensor
+      TensorCoord extent,
+      /// ID of each participating thread
+      int thread_id,
+      /// Initial offset of threadblock
+      TensorCoord const &threadblock_offset)
+      : base_(params, pointer, extent, flip_thread_id(thread_id), threadblock_offset) {}
+
+  /// Construct a PredicatedTileAccessIterator with zero threadblock offset
+  CUTLASS_HOST_DEVICE
+  OptionalPredicatedTileAccessIterator(
+      /// Precomputed parameters object
+      Params const &params,
+      /// Pointer to start of tensor
+      Pointer pointer,
+      /// Extent of tensor
+      TensorCoord extent,
+      ///< ID of each participating thread
+      int thread_id)
+      : OptionalPredicatedTileAccessIterator(params, pointer, extent, thread_id, make_Coord(0, 0)) {}
+
+  /// Overrides the internal iteration index
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(int index) {
+    base_.set_iteration_index(index);
+  }
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    base_.add_pointer_offset(pointer_offset);
+  }
+
+  /// Advances an iterator along logical dimensions of matrix in units of whole tiles
+  CUTLASS_DEVICE
+  void add_tile_offset(
+      TensorCoord const &tile_offset) {
+    base_.add_tile_offset(tile_offset);
+  }
+
+  /// Returns a pointer
+  CUTLASS_HOST_DEVICE
+  AccessType *get() const {
+    return base_.get();
+  }
+
+  /// Increment and return an instance to self.
+  CUTLASS_HOST_DEVICE
+  OptionalPredicatedTileAccessIterator &operator++() {
+    ++base_;
+    return *this;
+  }
+
+  /// Increment and return an instance to self.
+  CUTLASS_HOST_DEVICE
+  OptionalPredicatedTileAccessIterator operator++(int) {
+    OptionalPredicatedTileAccessIterator self(*this);
+    operator++();
+    return self;
+  }
+
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void clear_mask(bool enable = true) {
+    base_.clear_mask(enable);
+  }
+
+  /// Enables the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void enable_mask() {
+    base_.enable_mask();
+  }
+
+  /// Sets the predicate mask, overriding value stored in predicate iterator
+  CUTLASS_HOST_DEVICE
+  void set_mask(Mask const &mask) {
+    base_.set_mask(mask);
+  }
+
+  /// Gets the mask
+  CUTLASS_HOST_DEVICE
+  void get_mask(Mask &mask) {
+    base_.get_mask(mask);
+  }
+
+  /// Returns whether access is valid or not
+  CUTLASS_HOST_DEVICE
+  bool valid() {
+    return base_.valid();
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization for the disabled version
+/// Reduce runtime overhead
+///
+template <
+    /// Tile shape of the iterator
+    typename Shape_,
+    typename Layout_,
+    int AdvanceRank_,
+    typename ThreadMap_,
+    typename AccessType_,
+    int kThreadBlockSize_>
+class OptionalPredicatedTileAccessIterator<Shape_, std::monostate, Layout_,
+                                           AdvanceRank_, ThreadMap_, AccessType_,
+                                           kThreadBlockSize_> {
+ public:
+  using Shape = Shape_;
+  using Element = std::monostate;
+  using Layout = Layout_;
+  static int const kAdvanceRank = AdvanceRank_;
+  using ThreadMap = ThreadMap_;
+  using AccessType = AccessType_;
+
+  static constexpr int kThreadblockSize = kThreadBlockSize_;
+
+  using Base = PredicatedTileAccessIterator<Shape, Element, Layout,
+                                            kAdvanceRank, ThreadMap, AccessType>;
+
+  using LongIndex = typename Base::LongIndex;
+  using Mask = typename Base::Mask;
+  using TensorCoord = typename Base::TensorCoord;
+  using TensorRef = typename Base::TensorRef;
+  using Params = typename Base::Params;
+  using Pointer = typename Base::Pointer;
+
+  static constexpr int kAccessesPerVector = Base::kAccessesPerVector;
+
+ public:
+  std::monostate base_;
+
+  /// Default constructor
+  OptionalPredicatedTileAccessIterator() : base_() {}
+
+  /// Constructs a TileIterator from its precomputed state, threadblock offset,
+  /// and thread ID
+  CUTLASS_HOST_DEVICE
+  OptionalPredicatedTileAccessIterator(
+      /// Precomputed parameters object
+      Params const &params,
+      /// Pointer to start of tensor
+      Pointer pointer,
+      /// Extent of tensor
+      TensorCoord extent,
+      /// ID of each participating thread
+      int thread_id,
+      /// Initial offset of threadblock
+      TensorCoord const &threadblock_offset)
+      : base_() {}
+
+  /// Construct a PredicatedTileAccessIterator with zero threadblock offset
+  CUTLASS_HOST_DEVICE
+  OptionalPredicatedTileAccessIterator(
+      /// Precomputed parameters object
+      Params const &params,
+      /// Pointer to start of tensor
+      Pointer pointer,
+      /// Extent of tensor
+      TensorCoord extent,
+      ///< ID of each participating thread
+      int thread_id)
+      : base_() {}
+
+  /// Overrides the internal iteration index
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(int index) {}
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void
add_pointer_offset(LongIndex pointer_offset) {}
+
+  /// Advances an iterator along logical dimensions of matrix in units of whole tiles
+  CUTLASS_DEVICE
+  void add_tile_offset(
+      TensorCoord const &tile_offset) {}
+
+  /// Returns a pointer
+  CUTLASS_HOST_DEVICE
+  AccessType *get() const {
+    return nullptr;
+  }
+
+  /// Increment and return an instance to self.
+  CUTLASS_HOST_DEVICE
+  OptionalPredicatedTileAccessIterator &operator++() {
+    return *this;
+  }
+
+  /// Increment and return an instance to self.
+  CUTLASS_HOST_DEVICE
+  OptionalPredicatedTileAccessIterator operator++(int) {
+    return *this;
+  }
+
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void clear_mask(bool enable = true) {}
+
+  /// Enables the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void enable_mask() {}
+
+  /// Sets the predicate mask, overriding value stored in predicate iterator
+  CUTLASS_HOST_DEVICE
+  void set_mask(Mask const &mask) {}
+
+  /// Gets the mask
+  CUTLASS_HOST_DEVICE
+  void get_mask(Mask &mask) {}
+
+  /// Returns whether access is valid or not
+  CUTLASS_HOST_DEVICE
+  bool valid() const { return false; }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+}  // namespace threadblock
+}  // namespace transform
+}  // namespace cutlass
diff --git a/onnxruntime/core/mickey/cutlass_ext/q4gemm/threadblock/optional_regular_tile_access_iter.h b/onnxruntime/core/mickey/cutlass_ext/q4gemm/threadblock/optional_regular_tile_access_iter.h
new file mode 100644
index 000000000000..4b0ae5317f8b
--- /dev/null
+++ b/onnxruntime/core/mickey/cutlass_ext/q4gemm/threadblock/optional_regular_tile_access_iter.h
@@ -0,0 +1,224 @@
+/**
+ * Copyright (c) Microsoft Corporation. All rights reserved.
+ * Licensed under the MIT license.
+ *
+ * @file optional_regular_tile_access_iter.h
+ * @brief Templates implementing the address computation of storing of tiles
+ * from pitch-linear rank=2 tensors.
+ *
+ * This iterator is just a wrapper of RegularTileAccessIterator, with the
+ * option to turn it off at compile time and minimize its runtime footprint.
+ * Also, it utilizes the higher numbered threads in the threadblock when the
+ * iterator can not utilize all the threads.
+ *
+ * Must be used in conjunction with OptionalPredicatedTileAccessIterator,
+ * with the same template parameters.
+ */
+
+#pragma once
+
+#include <variant>
+
+#include "cutlass/cutlass.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/transform/threadblock/regular_tile_access_iterator.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace transform {
+namespace threadblock {
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Optional 2-D tile iterator, when element is std::monostate, the iterator
+/// becomes no-op with minimal runtime footprint. Also, it utilizes the higher
+/// numbered threads in the threadblock when the iterator can not utilize all
+/// the threads.
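+///
+/// E.g. (sketch only; kHasOffset and the surrounding type names are
+/// illustrative): selecting the element type at compile time turns the whole
+/// offset pipeline on or off with no runtime branching:
+///
+///   using ElementQOffset =
+///       std::conditional_t<kHasOffset, uint8_t, std::monostate>;
+///   OptionalRegularTileAccessIterator<QShape, ElementQOffset, LayoutQMeta,
+///                                     1, ThreadMapQOffset, kThreads> iter(ref, tid);
+///   iter.add_tile_offset({0, 1});   // compiles to nothing when disabled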
+///
+template <
+    /// Tile shape of the iterator
+    typename Shape_,
+    typename Element_,
+    typename Layout_,
+    int AdvanceRank,
+    typename ThreadMap_,
+    /// Number of threads in the threadblock, when not -1, the iterator
+    /// will utilize the higher numbered threads
+    int ThreadblockSize_ = -1,
+    int Alignment =
+        sizeof_bits<Element_>::value * ThreadMap_::kElementsPerAccess / 8>
+class OptionalRegularTileAccessIterator {
+ public:
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = Layout_;
+  using ThreadMap = ThreadMap_;
+  static constexpr int kAlignment = Alignment;
+  static constexpr int kThreadblockSize = ThreadblockSize_;
+
+  static_assert(!std::is_same<Element, std::monostate>::value,
+                "Disabled Iterator failed to match the specialized template");
+  static_assert(kThreadblockSize == -1 || kThreadblockSize >= ThreadMap::kThreads,
+                "kThreadblockSize must be no smaller than ThreadMap::kThreads");
+
+  using Base = RegularTileAccessIterator<Shape, Element, Layout,
+                                         AdvanceRank, ThreadMap, kAlignment>;
+
+  using LongIndex = typename Base::LongIndex;
+  using TensorRef = typename Base::TensorRef;
+  using TensorCoord = typename Base::TensorCoord;
+  using AccessType = typename Base::AccessType;
+
+  CUTLASS_HOST_DEVICE
+  static int flip_thread_id(int thread_id) {
+    if constexpr (kThreadblockSize > 0) {
+      return kThreadblockSize - 1 - thread_id;
+    }
+    return thread_id;
+  }
+
+ private:
+  Base base_;
+
+ public:
+  /// Construct a TileIterator with zero threadblock offset
+  CUTLASS_HOST_DEVICE
+  OptionalRegularTileAccessIterator(TensorRef ref,  ///< Pointer to start of tensor
+                                    int thread_id   ///< ID of each participating thread
+                                    )
+      : base_(ref, flip_thread_id(thread_id)) {}
+
+  /// Overrides the internal iteration index
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(int index) {
+    base_.set_iteration_index(index);
+  }
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    base_.add_pointer_offset(pointer_offset);
+  }
+
+  /// Returns a pointer
+  CUTLASS_DEVICE
+  AccessType *get() const {
+    return base_.get();
+  }
+
+  /// Advances to the next tile in memory.
+  CUTLASS_HOST_DEVICE
+  OptionalRegularTileAccessIterator &operator++() {
+    ++base_;
+    return *this;
+  }
+
+  /// Advances to the next tile in memory.
+  CUTLASS_HOST_DEVICE
+  OptionalRegularTileAccessIterator operator++(int) {
+    OptionalRegularTileAccessIterator prev(*this);
+    this->operator++();
+
+    return prev;
+  }
+
+  /// Adds a tile offset in the unit of tile.
+  /// In GEMM/Conv implementation, this is used to move in the k dimension in the shared memory.
+  /// Below layouts are the shared memory layouts. Current SM50 SIMT kernels only use col major A and row major B.
+  ///   For row major A operand, k dimension is contiguous dimension;
+  ///   For col major A operand, k dimension is strided dimension;
+  ///   For row major B operand, k dimension is strided dimension;
+  ///   For col major B operand, k dimension is contiguous dimension.
+  /// Below two classes map col/row major to the pitch linear coordinates used
+  /// in this base class.
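+  /// For the quantization meta data handled by this wrapper, the mainloop uses
+  /// this to advance one threadblock tile at a time along the k dimension.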
+  CUTLASS_DEVICE
+  void add_tile_offset(TensorCoord const &coord) {
+    base_.add_tile_offset(coord);
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization when Element is std::monostate, the iterator becomes no-op
+///
+template <
+    typename Shape_,
+    typename Layout_,
+    int AdvanceRank,
+    typename ThreadMap_,
+    int ThreadblockSize_,
+    int Alignment>
+class OptionalRegularTileAccessIterator<Shape_, std::monostate, Layout_,
+                                        AdvanceRank, ThreadMap_,
+                                        ThreadblockSize_, Alignment> {
+ public:
+  using Shape = Shape_;
+  using Element = std::monostate;
+  using Layout = Layout_;
+  using ThreadMap = ThreadMap_;
+  static constexpr int kAlignment = Alignment;
+  static constexpr int kThreadblockSize = ThreadblockSize_;
+
+  using Base = RegularTileAccessIterator<Shape, Element, Layout,
+                                         AdvanceRank, ThreadMap, kAlignment>;
+
+  using LongIndex = typename Base::LongIndex;
+  using TensorRef = typename Base::TensorRef;
+  using TensorCoord = typename Base::TensorCoord;
+  using AccessType = typename Base::AccessType;
+
+ private:
+  std::monostate base_;
+
+ public:
+  /// Construct a TileIterator with zero threadblock offset
+  CUTLASS_HOST_DEVICE
+  OptionalRegularTileAccessIterator(TensorRef ref,  ///< Pointer to start of tensor
+                                    int thread_id   ///< ID of each participating thread
+                                    )
+      : base_() {}
+
+  /// Overrides the internal iteration index
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(int index) {}
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {}
+
+  /// Returns a pointer
+  CUTLASS_DEVICE
+  AccessType *get() const {
+    return nullptr;
+  }
+
+  /// Advances to the next tile in memory.
+  CUTLASS_HOST_DEVICE
+  OptionalRegularTileAccessIterator &operator++() {
+    return *this;
+  }
+
+  /// Advances to the next tile in memory.
+  CUTLASS_HOST_DEVICE
+  OptionalRegularTileAccessIterator operator++(int) {
+    return *this;
+  }
+
+  /// Adds a tile offset in the unit of tile; a no-op in this disabled version.
+  CUTLASS_DEVICE
+  void add_tile_offset(TensorCoord const &coord) {}
+};
+
+}  // namespace threadblock
+}  // namespace transform
+}  // namespace cutlass
diff --git a/onnxruntime/core/mickey/cutlass_ext/q4gemm/threadblock/quantb_mma_multistage.h b/onnxruntime/core/mickey/cutlass_ext/q4gemm/threadblock/quantb_mma_multistage.h
new file mode 100644
index 000000000000..28364cc34f2d
--- /dev/null
+++ b/onnxruntime/core/mickey/cutlass_ext/q4gemm/threadblock/quantb_mma_multistage.h
@@ -0,0 +1,1290 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/** + * Modifications Copyright (c) Microsoft. + * Licensed under the MIT license. + * + * @file quantb_mma_multistage.h + * @brief Modified from cutlass/gemm/threadblock/mma_multistage.h. + * Added the quantized data memory pipeline, dequantization, and feeding + * to tensor cores. Mainloop pipeline is heavily modified. + */ + +#pragma once + +#include "cutlass/aligned_buffer.h" +#include "cutlass/arch/memory.h" +#include "cutlass/array.h" +#include "cutlass/cutlass.h" +#include "cutlass/gemm/gemm.h" +#include "cutlass/matrix_shape.h" +#include "cutlass/numeric_types.h" + +#include "cutlass/gemm/threadblock/mma_base.h" + +#include "cutlass/util/debug.h" +#include "cutlass/util/device_dump.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// +namespace{ + +///////////////////////////////////////////////////////////////////////////////////////////////// +/// Utilities for printing layout for the prepacked weights and quantization parameters +/// +template< + /// Data type of the prepacked weights + typename ElementWeight, + /// Data type of the quant scales + typename ElementQScale, + /// Data type of the quant offsets + typename ElementQOffset> +struct QuantBLayoutDebug{ + static constexpr bool debug_smem = true; + static constexpr bool debug_fragment = true; + ElementWeight* smem_b_ptr_; + ElementQScale* smem_qscale_ptr_; + ElementQOffset* smem_qoffset_ptr_; + int warp_id_; + int lane_id_; + int block_id_; + + template + CUTLASS_DEVICE + static void print_fragment(cutlass::Array const& frag, char label, int block_id, int warp_id, int lane_id){ + static_assert(Size % 4 == 0, "Size must be multiple of 4"); + if constexpr (debug_fragment){ + if (block_id == 1 && warp_id == 0){ + const Element* ptr = reinterpret_cast(&frag); + for (int i = 0; i < Size/4; i++, ptr+=4){ + if constexpr(std::is_integral::value){ + printf("T%.2d%c%d, %3d, %3d, %3d, %3d\n", + threadIdx.x, label, i, + ptr[0], ptr[1], ptr[2], ptr[3]); + } else { + printf("T%.2d%c%d, %.3f, %.3f, %.3f, %.3f\n", + threadIdx.x, label, i, + float(ptr[0]), float(ptr[1]), float(ptr[2]), float(ptr[3])); + } + } + } + } + } + + template + CUTLASS_DEVICE + static void 
print_as_int4(cutlass::Array const& frag, char label, int block_id, int warp_id, int lane_id){ + constexpr int I8Size = Size * cutlass::sizeof_bits::value / 8; + static_assert(I8Size % 2 == 0, "Size must be multiple of 4"); + if constexpr (debug_fragment){ + if (block_id == 1 && warp_id == 0){ + const uint8_t* ptr = reinterpret_cast(&frag); + for (int i = 0; i < I8Size/2; i++, ptr+=2){ + printf("T%.2dW%d, %d, %d, %d, %d\n", threadIdx.x, i, ptr[0] & 0x0f, ptr[0] >> 4, ptr[1] & 0x0f, ptr[1] >> 4); + } + } + } + } + +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Dummy type when quant offset is not used, to avoid compilation error, +/// and reduce runtime footprint +/// +struct DummyType{ + std::monostate dummy_; + public: + DummyType() = default; + + CUTLASS_HOST_DEVICE + void* data() const { + return nullptr; + } + + CUTLASS_HOST_DEVICE + std::monostate& operator[](int /*idx */) { + return dummy_; + } +}; + +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace gemm { +namespace threadblock { + +//////////////////////////////////////////////////////////////////////////////// + +/// Structure to compute the matrix product targeting CUDA cores and SIMT math +/// instructions. +template < + /// Size of the Gemm problem - concept: gemm::GemmShape<> + typename Shape_, + /// Policy describing tuning details (concept: MmaPolicy) + typename Policy_, + /// Number of stages, + int Stages, + /// Used for partial specialization + typename Enable = bool> +class QuantBMmaBase { + public: + ///< Size of the Gemm problem - concept: gemm::GemmShape<> + using Shape = Shape_; + + ///< Policy describing tuning details + using Policy = Policy_; + + // + // Dependent types + // + + /// Warp-level Mma + using Operator = typename Policy::Operator; + + /// Shape describing the overall GEMM computed from shared memory + /// by each warp. 
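+  /// (Added note with assumed numbers: for a warp-level GemmShape<64, 64, 32>
+  /// running on 16x8x16 tensor core instructions, kWarpGemmIterations below is
+  /// 32 / 16 == 2, i.e. each warp steps through two K-groups of mma
+  /// instructions per threadblock mainloop iteration.)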
+ using WarpGemm = typename Policy::Operator::Shape; + + /// Shape describing the number of warps filling the CTA + using WarpCount = GemmShape; + + /// Number of warp-level GEMM oeprations + static int const kWarpGemmIterations = + (WarpGemm::kK / Operator::Policy::MmaShape::kK); + + /// Number of stages + static int const kStages = Stages; + + static constexpr bool kHasQOffset = !std::is_same::value; + + /// Tensor reference to the A operand + using TensorRefA = TensorRef; + + /// Tensor reference to the prepacked weights + using TensorRefB = TensorRef; + + static_assert(kWarpGemmIterations > 1, + "The pipelined structure requires at least two warp-level " + "GEMM operations."); + + static_assert((kWarpGemmIterations % 2) == 0, + "Inner loop iteration must be an even number."); + + // Tensor reference to the quantization scales + using TensorRefQScale = TensorRef; + using TensorRefQOffset = TensorRef; + + // Block size of the quantization (one set of quantization parameters per block of weights) + using QuantBlocking = typename Operator::QuantBlocking; + + // + // Nested structs + // + + /// Shared storage object needed by threadblock-scoped GEMM + class SharedStorage { + public: + // + // Type definitions + // + + /// Shape of the A matrix operand in shared memory + using ShapeA = MatrixShape; + + /// Shape of the prepacked weights in shared memory + using ShapeB = + MatrixShape; + + /// Shape of the quantization parameter matrix in shared memory + /// Validation done in mma core class ThreadblockQShape + using ShapeQScale = + MatrixShape<(Shape::kK / QuantBlocking::kRow) * kStages, + Shape::kN / QuantBlocking::kColumn>; + + using BufTypeQOffset = std::conditional_t, + DummyType>; + public: + // + // Data members + // + + /// Buffer for A operand + AlignedBuffer operand_A; + + /// Buffer for prepacked weights + AlignedBuffer operand_B; + + /// Buffer for quantization scales + AlignedBuffer operand_QScale; + + /// Buffer for quantization offsets + BufTypeQOffset operand_QOffset; + + public: + + // + // Methods + // + + /// Returns a layout object for the A matrix + CUTLASS_DEVICE + static typename Operator::LayoutA LayoutA() { + return Operator::LayoutA::packed({ShapeA::kRow, ShapeA::kColumn}); + } + + /// Returns a layout object for the B matrix + CUTLASS_HOST_DEVICE + static typename Operator::LayoutB LayoutB() { + return Operator::LayoutB::packed({ShapeB::kRow, ShapeB::kColumn}); + } + + CUTLASS_HOST_DEVICE + static typename Operator::SmemLayoutQScale LayoutQMeta() { + return Operator::SmemLayoutQScale::packed({ShapeQScale::kRow, ShapeQScale::kColumn}); + } + + CUTLASS_HOST_DEVICE + static typename Operator::SmemLayoutQOffset LayoutQOffset() { + return Operator::SmemLayoutQOffset::packed({ShapeQScale::kRow, ShapeQScale::kColumn}); + } + + /// Returns a TensorRef to the A operand + CUTLASS_HOST_DEVICE + TensorRefA operand_A_ref() { + return TensorRefA{operand_A.data(), LayoutA()}; + } + + /// Returns a TensorRef to the prepacked weights + CUTLASS_HOST_DEVICE + TensorRefB operand_B_ref() { + return TensorRefB{operand_B.data(), LayoutB()}; + } + + /// Returns a TensorRef to the quantization scales + CUTLASS_HOST_DEVICE + TensorRefQScale operand_QScale_ref() { + return TensorRefQScale{operand_QScale.data(), LayoutQMeta()}; + } + + CUTLASS_HOST_DEVICE + TensorRefQOffset operand_QOffset_ref() { + if constexpr (!kHasQOffset){ + return TensorRefQOffset(); + } else { + return TensorRefQOffset{operand_QOffset.data(), LayoutQOffset()}; + } + } + }; + + protected: + + // + // Data members + // + 
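+  // Added commentary (illustrative sizing with assumed numbers): with a
+  // threadblock tile of Shape::kK == 64, Shape::kN == 64, kStages == 3 and
+  // QuantBlocking<16, 1> (one scale per 16 weights along K), ShapeQScale above
+  // works out to MatrixShape<(64 / 16) * 3, 64 / 1>, i.e. 12 x 64 scale
+  // elements in shared memory - a small fraction of the packed weight buffer
+  // it dequantizes, which is why later code lets only a subset of threads
+  // copy it.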
+ /// Iterator to load a warp-scoped tile of A operand from shared memory + typename Operator::IteratorA warp_tile_iterator_A_; + + /// Iterator to load a warp-scoped tile of B operand from shared memory + typename Operator::IteratorB warp_tile_iterator_B_; + + /// Iterator to load a warp-scoped tile of quant scales from shared memory + typename Operator::IteratorQMeta warp_tile_iterator_QScale_; + +public: + + /// Construct from tensor references + CUTLASS_DEVICE + QuantBMmaBase( + ///< Shared storage needed for internal use by threadblock-scoped GEMM + SharedStorage &shared_storage, + ///< ID within the threadblock + int thread_idx, + ///< ID of warp + int warp_idx, + ///< ID of each thread within a warp + int lane_idx + ): + warp_tile_iterator_A_(shared_storage.operand_A_ref(), lane_idx), + warp_tile_iterator_B_(shared_storage.operand_B_ref(), lane_idx), + warp_tile_iterator_QScale_(shared_storage.operand_QScale_ref(), + shared_storage.operand_QOffset_ref(), lane_idx) + {} +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Structure to compute the matrix product targeting CUDA cores and SIMT math +/// instructions. +template < + /// Size of the Gemm problem - concept: gemm::GemmShape<> + typename Shape_, + /// Iterates over tiles of A operand in global memory + // (concept: ReadableTileIterator | ForwardTileIterator | + // MaskedTileIterator) + typename IteratorA_, + /// Iterates over tiles of A operand in shared memory + /// (concept: WriteableTileIterator | RandomAccessTileIterator) + typename SmemIteratorA_, + /// Cache operation for operand A + cutlass::arch::CacheOperation::Kind CacheOpA, + /// Iterates over tiles of B operand in global memory + // (concept: ReadableTileIterator | ForwardTileIterator | + // MaskedTileIterator) + typename IteratorB_, + /// Iterates over tiles of B operand in shared memory + /// (concept: WriteableTileIterator | RandomAccessTileIterator) + typename SmemIteratorB_, + /// Cache operation for operand B + cutlass::arch::CacheOperation::Kind CacheOpB, + /// Iterators over tiles of quant scales in global memory + typename IteratorQScale_, + /// Iterators over tiles of quant scales in shared memory + typename SmemIteratorQScale_, + /// Cache operation for quant scales + cutlass::arch::CacheOperation::Kind CacheOpQScale, + /// Iterators over tiles of quant scales in global memory + typename IteratorQOffset_, + /// Iterators over tiles of quant scales in shared memory + typename SmemIteratorQOffset_, + /// Cache operation for quant scales + cutlass::arch::CacheOperation::Kind CacheOpQOffset, + /// Data type of accumulator matrix + typename ElementC_, + /// Data type of accumulator matrix + typename LayoutC_, + /// Policy describing tuning details (concept: MmaPolicy) + typename Policy_, + /// Number of stages, + int Stages, + /// Used for partial specialization + typename Enable = bool> +class QuantBMmaMultistage : + public QuantBMmaBase { +public: + ///< Base class + using Base = QuantBMmaBase; + ///< Size of the Gemm problem - concept: gemm::GemmShape<> + using Shape = Shape_; + ///< Iterates over tiles of A operand in global memory + using IteratorA = IteratorA_; + ///< Iterates over tiles of B operand in global memory + using IteratorB = IteratorB_; + ///< Data type of accumulator matrix + using ElementC = ElementC_; + ///< Layout of accumulator matrix + using LayoutC = LayoutC_; + ///< Policy describing tuning details + using Policy = Policy_; + + using SmemIteratorA = SmemIteratorA_; + using 
SmemIteratorB = SmemIteratorB_; + + static cutlass::arch::CacheOperation::Kind const kCacheOpA = CacheOpA; + static cutlass::arch::CacheOperation::Kind const kCacheOpB = CacheOpB; + + using IteratorQScale = IteratorQScale_; + using IteratorQOffset = IteratorQOffset_; + using SmemIteratorQScale = SmemIteratorQScale_; + using SmemIteratorQOffset = SmemIteratorQOffset_; + using QuantBlocking = typename Base::QuantBlocking; + + static cutlass::arch::CacheOperation::Kind const kCacheOpQScale = CacheOpQScale; + static cutlass::arch::CacheOperation::Kind const kCacheOpQOffset = CacheOpQOffset; + static constexpr bool kHasQOffset = Base::kHasQOffset; + + // + // Dependent types + // + + /// Fragment of accumulator tile + using FragmentC = typename Policy::Operator::FragmentC; + + /// Warp-level Mma + using Operator = typename Policy::Operator; + + /// Minimum architecture is Sm80 to support cp.async + using ArchTag = arch::Sm80; + + /// Complex transform on A operand + static ComplexTransform const kTransformA = Operator::kTransformA; + + /// Complex transform on B operand + static ComplexTransform const kTransformB = Operator::kTransformB; + + /// Internal structure exposed for introspection. + struct Detail { + + /// Number of cp.async instructions to load one stage of operand A + static int const AsyncCopyIterationsPerStageA = + IteratorA::ThreadMap::Iterations::kCount; + + /// Number of cp.async instructions to load one stage of packed weights + static int const AsyncCopyIterationsPerStageB = + IteratorB::ThreadMap::Iterations::kCount; + + /// Number of stages + static int const kStages = Stages; + + /// Number of cp.async instructions to load on group of operand A + static int const kAccessesPerGroupA = + (AsyncCopyIterationsPerStageA + Base::kWarpGemmIterations - 1) / Base::kWarpGemmIterations; + + /// Number of cp.async instructions to load on group of operand B + static int const kAccessesPerGroupB = + (AsyncCopyIterationsPerStageB + Base::kWarpGemmIterations - 1) / Base::kWarpGemmIterations; + + static int const AsyncCopyIterationsPerStageQScale = + IteratorQScale::ThreadMap::Iterations::kCount; + + /// Number of cp.async instructions to load one stage of quant scale + static int const kAccessesPerGroupQScale = + (AsyncCopyIterationsPerStageQScale + Base::kWarpGemmIterations - 1) / Base::kWarpGemmIterations; + + static int const AsyncCopyIterationsPerStageQOffset = + IteratorQOffset::ThreadMap::Iterations::kCount; + + /// Number of cp.async instructions to load one stage of quant offset + static int const kAccessesPerGroupQOffset = + (AsyncCopyIterationsPerStageQOffset + Base::kWarpGemmIterations - 1) / Base::kWarpGemmIterations; + + // Optional staged-accumulation (e.g., tf32x3 kernels) for improved numerical + // accuracy, where each mainloop iteration first accumulates into a temporary + // set of freshly-cleared accumulators, which are subsequently added to the + // final accumulator set. 
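+    // (Added sketch, not part of the original change: the staged pattern is
+    // roughly
+    //     tmp_accum.clear();
+    //     mma(tmp_accum, a, b, tmp_accum);   // repeated within one iteration
+    //     accum = accum + tmp_accum;
+    // so round-off grows over one mainloop iteration at a time rather than
+    // across the entire K dimension.)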
+ static bool const kStagedAccumulation = arch::UseStagedAccumulation::value; + }; + + private: + + + // Structure encapsulating pipeline state live from one iteration to the next + struct PipeState { + + using WarpLoadedFragmentA = typename Operator::FragmentA; + using WarpLoadedFragmentB = typename Operator::FragmentB; + using WarpTransformedFragmentA = typename Operator::TransformedFragmentA; + using WarpTransformedFragmentB = typename Operator::TransformedFragmentB; + + /// Temporary accumulator to facilitate staged-accumulation + FragmentC tmp_accum_; + + /// Pair of A fragments used to overlap shared memory loads and math instructions + WarpLoadedFragmentA warp_loaded_frag_A_[2]; + + /// Pair of B fragments used to overlap shared memory loads and math instructions + WarpLoadedFragmentB warp_loaded_frag_B_; + WarpTransformedFragmentB warp_transformed_frag_B_[2]; + + using WarpLoadedFragmentQScale = typename Operator::FragmentQScale; + WarpLoadedFragmentQScale warp_loaded_frag_QScale_; + + using WarpLoadedFragmentQOffset = typename std::conditional::type; + WarpLoadedFragmentQOffset warp_loaded_frag_QOffset_; + }; + + + private: + + // + // Data members + // + + /// Warp-level MMA operator + Operator warp_mma_; + + /// Iterator to write threadblock-scoped tile of A operand to shared memory + SmemIteratorA smem_iterator_A_; + + /// Iterator to write threadblock-scoped tile of B operand to shared memory + SmemIteratorB smem_iterator_B_; + + /// Iterator to write threadblock-scoped tile of quant meta data to shared memory + SmemIteratorQScale smem_iterator_QScale_; + SmemIteratorQOffset smem_iterator_QOffset_; + + /// Shared memory write stage index + int smem_write_stage_idx_; + + /// Shared memory read stage index + int smem_read_stage_idx_; + + /// very small meta data tensor require less threads to load + bool const should_load_qscale_; + bool const should_load_qoffset_; + + /// Shared memory pointers for debug dumping + static constexpr bool debug_layout = false; + using LayoutDebugType = typename std::conditional, + std::monostate>::type; + LayoutDebugType layout_debug_; + +public: + + /// Construct from tensor references + CUTLASS_DEVICE + QuantBMmaMultistage( + ///< Shared storage needed for internal use by threadblock-scoped GEMM + typename Base::SharedStorage &shared_storage, + ///< ID within the threadblock + int thread_idx, + ///< ID of warp + int warp_idx, + ///< ID of each thread within a warp + int lane_idx + ): + Base(shared_storage, thread_idx, warp_idx, lane_idx), + smem_iterator_A_(shared_storage.operand_A_ref(), thread_idx), + smem_iterator_B_(shared_storage.operand_B_ref(), thread_idx), + smem_iterator_QScale_(shared_storage.operand_QScale_ref(), thread_idx), + smem_iterator_QOffset_(shared_storage.operand_QOffset_ref(), thread_idx), + should_load_qscale_(thread_idx < IteratorQScale::ThreadMap::kThreads), + should_load_qoffset_(thread_idx >= IteratorQOffset::kThreadblockSize - IteratorQOffset::ThreadMap::kThreads), + smem_write_stage_idx_(0), + smem_read_stage_idx_(0) + { + // Compute warp location within threadblock tile by mapping the warp_id to + // three coordinates: + // _m: the warp's position within the threadblock along the M dimension + // _n: the warp's position within the threadblock along the N dimension + // _k: the warp's position within the threadblock along the K dimension + if constexpr(debug_layout){ + layout_debug_.smem_b_ptr_ = shared_storage.operand_B_ref().data(); + layout_debug_.smem_qscale_ptr_ = shared_storage.operand_QScale_ref().data(); + if 
constexpr(kHasQOffset){ + layout_debug_.smem_qoffset_ptr_ = shared_storage.operand_QOffset_ref().data(); + } else { + layout_debug_.smem_qoffset_ptr_ = nullptr; + } + layout_debug_.warp_id_ = warp_idx; + layout_debug_.lane_id_ = lane_idx; + layout_debug_.block_id_ = blockIdx.x + blockIdx.y * gridDim.x + gridDim.x * gridDim.y * blockIdx.z; + } + + int warp_idx_mn = warp_idx % (Base::WarpCount::kM * Base::WarpCount::kN); + int warp_idx_k = warp_idx / (Base::WarpCount::kM * Base::WarpCount::kN); + + int warp_idx_m = warp_idx_mn % Base::WarpCount::kM; + int warp_idx_n = warp_idx_mn / Base::WarpCount::kM; + + // Add per-warp offsets in units of warp-level tiles + this->warp_tile_iterator_A_.add_tile_offset( + {warp_idx_m, Base::kWarpGemmIterations * warp_idx_k}); + this->warp_tile_iterator_B_.add_tile_offset( + {Base::kWarpGemmIterations * warp_idx_k, warp_idx_n}); + this->warp_tile_iterator_QScale_.add_tile_offset( + {Base::kWarpGemmIterations * warp_idx_k, warp_idx_n}); + } + + /// Advance shared memory read-iterators to the next stage + CUTLASS_DEVICE + void advance_smem_read_stage() + { + ++smem_read_stage_idx_; + + if (smem_read_stage_idx_ == Base::kStages) { + // Wrap back around to the 'start' of the circular buffer in shared memory + this->warp_tile_iterator_A_.add_tile_offset({0, -Base::kStages * Policy::kPartitionsK * Base::kWarpGemmIterations}); + this->warp_tile_iterator_B_.add_tile_offset({-Base::kStages * Policy::kPartitionsK * Base::kWarpGemmIterations, 0}); + this->warp_tile_iterator_QScale_.add_tile_offset({-Base::kStages * Policy::kPartitionsK * Base::kWarpGemmIterations, 0}); + + smem_read_stage_idx_ = 0; + } + } + + /// Advance global memory read-iterators and shared memory write-iterators to the stage + CUTLASS_DEVICE + void advance_smem_write_stage( + IteratorA &iterator_A, + IteratorB &iterator_B, + IteratorQScale &iterator_QScale, + IteratorQOffset &iterator_QOffset) + { + // Advance global iterators + iterator_A.add_tile_offset({0, 1}); + iterator_B.add_tile_offset({1, 0}); + iterator_QScale.add_tile_offset({1, 0}); + + // Advance shared iterators + smem_iterator_A_.add_tile_offset({0, 1}); + smem_iterator_B_.add_tile_offset({1, 0}); + smem_iterator_QScale_.add_tile_offset({1, 0}); + + if constexpr (kHasQOffset) { + iterator_QOffset.add_tile_offset({1, 0}); + smem_iterator_QOffset_.add_tile_offset({1, 0}); + } + + // Increment shared memory write stage index + ++smem_write_stage_idx_; + + if (smem_write_stage_idx_ == Base::kStages) { + // Wrap back around to the 'start' of the circular buffer in shared memory + smem_iterator_A_.add_tile_offset({0, -Base::kStages}); + smem_iterator_B_.add_tile_offset({-Base::kStages, 0}); + smem_iterator_QScale_.add_tile_offset({-Base::kStages, 0}); + if constexpr (kHasQOffset) { + smem_iterator_QOffset_.add_tile_offset({-Base::kStages, 0}); + } + smem_write_stage_idx_ = 0; + } + } + + CUTLASS_DEVICE + void copy_qscale_tiles(IteratorQScale &iterator_QScale){ + // Quant scale matrix is 1/block_size of the B matrix, for a 64x64 warp tile, + // it's only 64x64/block_size elements. For blocking size 16 ~ 64, it only + // takes 4 ~ 16 cp.async instructions to load. One warp has 32 threads, so + // it should be loaded in less than one cp.async instruction per thread. + // Even less for quant offset matrix. 
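+    // (Added arithmetic check, illustrative: a 64x64 B tile with block_size 32
+    // needs 64*64/32 == 128 fp16 scales, i.e. 256 bytes per stage - sixteen
+    // 16-byte cp.async accesses across the whole threadblock - consistent with
+    // the one-shot static_asserts below.)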
+    static_assert(Detail::AsyncCopyIterationsPerStageQScale == 1,
+                  "Quant scale should be loaded in one shot!");
+    static_assert(IteratorQScale::kAccessesPerVector == 1,
+                  "Quant scale should be 1 access per vector!");
+
+    // Async Copy for quantization scale
+    typename IteratorQScale::AccessType *dst_ptr =
+        reinterpret_cast<typename IteratorQScale::AccessType *>(
+            this->smem_iterator_QScale_.get());
+
+    constexpr int kSrcBytes =
+        sizeof_bits<typename IteratorQScale::Element>::value *
+        IteratorQScale::ThreadMap::kElementsPerAccess / 8;
+
+    cutlass::arch::cp_async<kSrcBytes, kCacheOpQScale>(
+        dst_ptr, iterator_QScale.get(), iterator_QScale.valid());
+  }
+
+  CUTLASS_DEVICE
+  void copy_qoffset_tiles(IteratorQOffset &iterator_QOffset) {
+    static_assert(Detail::AsyncCopyIterationsPerStageQOffset == 1,
+                  "Quant offset should be loaded in one shot!");
+    static_assert(IteratorQOffset::kAccessesPerVector == 1,
+                  "Quant offset should be 1 access per vector!");
+
+    if constexpr(kHasQOffset) {
+      // Async Copy for quantization offset
+      typename IteratorQOffset::AccessType *dst_ptr =
+          reinterpret_cast<typename IteratorQOffset::AccessType *>(
+              this->smem_iterator_QOffset_.get());
+
+      constexpr int kSrcBytes = sizeof_bits<typename IteratorQOffset::Element>::value *
+                                IteratorQOffset::ThreadMap::kElementsPerAccess / 8;
+
+      cutlass::arch::cp_async<kSrcBytes, kCacheOpQOffset>(
+          dst_ptr, iterator_QOffset.get(), iterator_QOffset.valid());
+    }
+  }
+
+  CUTLASS_DEVICE
+  void copy_tiles_and_advance(IteratorA &iterator_A, IteratorB &iterator_B,
+                              int group_start = 0) {
+    auto group_start_A = group_start * Detail::kAccessesPerGroupA;
+    iterator_A.set_iteration_index(group_start_A *
+                                   IteratorA::kAccessesPerVector);
+    this->smem_iterator_A_.set_iteration_index(group_start_A);
+
+    // Async Copy for operand A
+    CUTLASS_PRAGMA_UNROLL
+    for (int j = 0; j < Detail::kAccessesPerGroupA; ++j) {
+      if (group_start_A + j < Detail::AsyncCopyIterationsPerStageA) {
+        typename IteratorA::AccessType *dst_ptr =
+            reinterpret_cast<typename IteratorA::AccessType *>(
+                this->smem_iterator_A_.get());
+
+        int const kSrcBytes = sizeof_bits<typename IteratorA::Element>::value *
+                              IteratorA::ThreadMap::kElementsPerAccess /
+                              IteratorA::kAccessesPerVector / 8;
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int v = 0; v < IteratorA::kAccessesPerVector; ++v) {
+          auto gmem_ptr = iterator_A.get();
+
+          cutlass::arch::cp_async<kSrcBytes, kCacheOpA>(
+              dst_ptr + v, gmem_ptr, iterator_A.valid());
+
+          ++iterator_A;
+        }
+
+        ++this->smem_iterator_A_;
+      }
+    }
+
+    auto group_start_B = group_start * Detail::kAccessesPerGroupB;
+    iterator_B.set_iteration_index(group_start_B *
+                                   IteratorB::kAccessesPerVector);
+    this->smem_iterator_B_.set_iteration_index(group_start_B);
+
+    // Async Copy for operand B
+    CUTLASS_PRAGMA_UNROLL
+    for (int j = 0; j < Detail::kAccessesPerGroupB; ++j) {
+      if (group_start_B + j < Detail::AsyncCopyIterationsPerStageB) {
+        typename IteratorB::AccessType *dst_ptr =
+            reinterpret_cast<typename IteratorB::AccessType *>(
+                this->smem_iterator_B_.get());
+
+        int const kSrcBytes = sizeof_bits<typename IteratorB::Element>::value *
+                              IteratorB::ThreadMap::kElementsPerAccess /
+                              IteratorB::kAccessesPerVector / 8;
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int v = 0; v < IteratorB::kAccessesPerVector; ++v) {
+          auto gmem_ptr = iterator_B.get();
+
+          cutlass::arch::cp_async<kSrcBytes, kCacheOpB>(
+              dst_ptr + v, gmem_ptr, iterator_B.valid());
+
+          ++iterator_B;
+        }
+        ++this->smem_iterator_B_;
+      }
+    }
+  }
+
+  /// GEMM prologue.
Bootstrap the global->shared memory pipeline by fetching + /// the global fragments needed by the first kStages-1 threadblock mainloop iterations + CUTLASS_DEVICE + void prologue( + IteratorA &iterator_A, ///< [in|out] iterator over A operand in global memory + IteratorB &iterator_B, ///< [in|out] iterator over B operand in global memory + IteratorQScale &iterator_QScale, ///< [in|out] iterator over quant scales in global memory + IteratorQOffset &iterator_QOffset, ///< [in|out] iterator over quant offsets in global memory + int &gemm_k_iterations) ///< [in|out] number of threadblock mainloop iterations remaining + { + // Issue several complete stages + CUTLASS_PRAGMA_UNROLL + for (int stage = 0; stage < Base::kStages - 1; ++stage, --gemm_k_iterations) { + + // Disable global fetching if done with global fetch iterations + iterator_A.clear_mask(gemm_k_iterations == 0); + iterator_B.clear_mask(gemm_k_iterations == 0); + iterator_QScale.clear_mask(gemm_k_iterations == 0 || !should_load_qscale_); + + iterator_A.set_iteration_index(0); + this->smem_iterator_A_.set_iteration_index(0); + + // Async Copy for operand A + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < Detail::AsyncCopyIterationsPerStageA; ++j) { + typename IteratorA::AccessType *dst_ptr = + reinterpret_cast( + this->smem_iterator_A_.get()); + + CUTLASS_PRAGMA_UNROLL + for (int v = 0; v < IteratorA::kAccessesPerVector; ++v) { + int const kSrcBytes = + sizeof_bits::value * + IteratorA::ThreadMap::kElementsPerAccess / + IteratorA::kAccessesPerVector / 8; + + int src_bytes = (iterator_A.valid() ? kSrcBytes : 0); + + cutlass::arch::cp_async_zfill( + dst_ptr + v, iterator_A.get(), iterator_A.valid()); + + ++iterator_A; + } + + ++this->smem_iterator_A_; + } + + iterator_B.set_iteration_index(0); + this->smem_iterator_B_.set_iteration_index(0); + + // Async Copy for operand B + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < Detail::AsyncCopyIterationsPerStageB; ++j) { + typename IteratorB::AccessType *dst_ptr = + reinterpret_cast( + this->smem_iterator_B_.get()); + + CUTLASS_PRAGMA_UNROLL + for (int v = 0; v < IteratorB::kAccessesPerVector; ++v) { + int const kSrcBytes = + sizeof_bits::value * + IteratorB::ThreadMap::kElementsPerAccess / + IteratorB::kAccessesPerVector / 8; + + cutlass::arch::cp_async_zfill( + dst_ptr + v, iterator_B.get(), iterator_B.valid()); + + ++iterator_B; + } + + ++this->smem_iterator_B_; + } + + // Async Copy for quantization scale + static_assert(Detail::AsyncCopyIterationsPerStageQScale == 1, "Quant scale should be loaded in one shot!"); + static_assert(IteratorQScale::kAccessesPerVector == 1, "Quant scale should 1 access per vector!"); + + typename IteratorQScale::AccessType *dst_ptr = + reinterpret_cast( + this->smem_iterator_QScale_.get()); + + constexpr int kSrcBytes = + sizeof_bits::value * + IteratorQScale::ThreadMap::kElementsPerAccess / 8; + + auto gmem_ptr = iterator_QScale.get(); + + cutlass::arch::cp_async( + dst_ptr, gmem_ptr, iterator_QScale.valid()); + + if constexpr (kHasQOffset) { + iterator_QOffset.clear_mask(gemm_k_iterations == 0 || !should_load_qoffset_); + + // Async Copy for quantization offset + static_assert(Detail::AsyncCopyIterationsPerStageQOffset == 1, "Quant offset should be loaded in one shot!"); + static_assert(IteratorQOffset::kAccessesPerVector == 1, "Quant offset should 1 access per vector!"); + typename IteratorQOffset::AccessType *dst_ptr = + reinterpret_cast( + this->smem_iterator_QOffset_.get()); + + constexpr int kSrcBytes = + sizeof_bits::value * + 
IteratorQOffset::ThreadMap::kElementsPerAccess / 8; + + cutlass::arch::cp_async( + dst_ptr, iterator_QOffset.get(), iterator_QOffset.valid()); + } + + // Move to the next write stage + advance_smem_write_stage(iterator_A, iterator_B, iterator_QScale, iterator_QOffset); + + // Defines the boundary of a stage of cp.async. + cutlass::arch::cp_async_fence(); + } + } + + + /// Wait until we have at least one completed global fetch stage + CUTLASS_DEVICE + void gmem_wait() + { + // Wait until we have at least one committed global fetch stage. (#uncommitted = Base::kStages - 1 - #committed) + cutlass::arch::cp_async_wait(); + __syncthreads(); + + if constexpr(debug_layout) { + if (LayoutDebugType::debug_smem && layout_debug_.block_id_ == 1) { + if (threadIdx.x == 0){ + printf("stage: %d\n", smem_write_stage_idx_); + } + cutlass::debug::dump_shmem(layout_debug_.smem_qscale_ptr_, Base::SharedStorage::ShapeQScale::kCount); + if constexpr(kHasQOffset){ + cutlass::debug::dump_shmem(layout_debug_.smem_qoffset_ptr_, Base::SharedStorage::ShapeQScale::kCount); + } + } + } + } + + /// Perform a threadblock mainloop iteration of matrix multiply-accumulate + CUTLASS_DEVICE + void mac_loop_iter( + PipeState &pipe_state, ///< [in|out] loop-carried pipeline state + FragmentC &accum, ///< [in|out] destination accumulator tile + IteratorA &iterator_A, ///< [in|out] iterator over A operand in global memory + IteratorB &iterator_B, ///< [in|out] iterator over B operand in global memory + IteratorQScale &iterator_QScale, ///< [in|out] iterator over quant scales in global memory + IteratorQOffset &iterator_QOffset, ///< [in|out] iterator over quant offsets in global memory + int &gemm_k_iterations) ///< [in|out] number of threadblock mainloop iterations remaining + { + // Unroll the warp-level MMA tiles of a threadblock's mainloop iteration + CUTLASS_PRAGMA_UNROLL + for (int warp_mma_k = 0; warp_mma_k < Base::kWarpGemmIterations; ++warp_mma_k) { + // Loading next warp-level tiles from shared memory. 
This can be skipped on the very + // last iteration where: + // (gemm_k_iterations == (1 - Base::kStages)) && (warp_mma_k == (Base::kWarpGemmIterations - 1)) + // However, evaluating this condition seems more expensive than simply loading the tiles + this->warp_tile_iterator_QScale_.load( + pipe_state.warp_loaded_frag_QScale_, + pipe_state.warp_loaded_frag_QOffset_); + ++this->warp_tile_iterator_QScale_; + + this->warp_tile_iterator_B_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations); + this->warp_tile_iterator_B_.load(pipe_state.warp_loaded_frag_B_); + ++this->warp_tile_iterator_B_; + + this->warp_tile_iterator_A_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations); + this->warp_tile_iterator_A_.load(pipe_state.warp_loaded_frag_A_[(warp_mma_k + 1) % 2]); + ++this->warp_tile_iterator_A_; + + // All warp-tiles issue their share of global->shared fragment copies + copy_tiles_and_advance( + iterator_A, + iterator_B, + (warp_mma_k + 1) % Base::kWarpGemmIterations); + + if constexpr(debug_layout) { + if (LayoutDebugType::debug_fragment && layout_debug_.block_id_ == 1 && layout_debug_.warp_id_ == 0 && layout_debug_.lane_id_ == 0){ + printf("LINE %d, warp_tile_B kgroup %d\n", __LINE__, warp_mma_k % Base::kWarpGemmIterations); + } + LayoutDebugType::print_as_int4(pipe_state.warp_loaded_frag_B_, 'W', layout_debug_.block_id_, layout_debug_.warp_id_, layout_debug_.lane_id_); + LayoutDebugType::print_fragment(Operator::IteratorQScale::debug_expand(pipe_state.warp_loaded_frag_QScale_), 'Q', layout_debug_.block_id_, layout_debug_.warp_id_, layout_debug_.lane_id_); + if constexpr(kHasQOffset){ + LayoutDebugType::print_fragment(Operator::IteratorQScale::debug_expand(pipe_state.warp_loaded_frag_QOffset_), 'O', layout_debug_.block_id_, layout_debug_.warp_id_, layout_debug_.lane_id_); + } + } + + warp_mma_.transform( + pipe_state.warp_transformed_frag_B_[(warp_mma_k + 1) % 2], + pipe_state.warp_loaded_frag_B_, + pipe_state.warp_loaded_frag_QScale_, + pipe_state.warp_loaded_frag_QOffset_); + + if constexpr(debug_layout) { + LayoutDebugType::print_fragment(pipe_state.warp_transformed_frag_B_[(warp_mma_k + 1) % 2], 'B', layout_debug_.block_id_, layout_debug_.warp_id_, layout_debug_.lane_id_); + } + + // Execute the current warp-tile of MMA operations + if (Detail::kStagedAccumulation) { + warp_mma_( + pipe_state.tmp_accum_, + pipe_state.warp_loaded_frag_A_[warp_mma_k % 2], + pipe_state.warp_transformed_frag_B_[warp_mma_k % 2], + pipe_state.tmp_accum_ + ); + + if (warp_mma_k == 0) { + plus plus_accum; + accum = plus_accum(accum, pipe_state.tmp_accum_); + pipe_state.tmp_accum_.clear(); + } + } else { + warp_mma_( + accum, + pipe_state.warp_loaded_frag_A_[warp_mma_k % 2], + pipe_state.warp_transformed_frag_B_[warp_mma_k % 2], + accum + ); + } + + if (warp_mma_k == 0) { + copy_qscale_tiles(iterator_QScale); + } + if (warp_mma_k == 1) { + copy_qoffset_tiles(iterator_QOffset); + } + + // The second-to-last warp-tile also moves to the next global fetch stage + if (warp_mma_k == Base::kWarpGemmIterations - 2) { + // Inserts a memory fence between stages of cp.async instructions. 
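+        // (Added note: the fence commits every cp.async issued since the last
+        // fence as one stage; the cp_async_wait in gmem_wait() then blocks
+        // until enough committed stages have landed in shared memory, keeping
+        // compute a bounded number of stages - at most kStages - 1 - ahead of
+        // the global->shared pipeline. The exact wait depth is an assumption
+        // based on the standard CUTLASS multistage pattern.)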
+ cutlass::arch::cp_async_fence(); + + // Move to the next global fetch stage + advance_smem_write_stage(iterator_A, iterator_B, iterator_QScale, iterator_QOffset); + advance_smem_read_stage(); + + // Disable global fetching when done with global fetch iterations + --gemm_k_iterations; + iterator_A.clear_mask(gemm_k_iterations == 0); + iterator_B.clear_mask(gemm_k_iterations == 0); + iterator_QScale.clear_mask(gemm_k_iterations == 0 || !should_load_qscale_); + if constexpr(kHasQOffset){ + iterator_QOffset.clear_mask(gemm_k_iterations == 0 || !should_load_qoffset_); + } + + // Wait until we have at least one completed global fetch stage + gmem_wait(); + } + + } + } + + /// Specialized mainloop iteration of matrix multiply-accumulate, for small M + CUTLASS_DEVICE + void mac_loop_iter_small_m( + PipeState &pipe_state, ///< [in|out] loop-carried pipeline state + FragmentC &accum, ///< [in|out] destination accumulator tile + IteratorA &iterator_A, ///< [in|out] iterator over A operand in global memory + IteratorB &iterator_B, ///< [in|out] iterator over B operand in global memory + IteratorQScale &iterator_QScale, ///< [in|out] iterator over quant scales in global memory + IteratorQOffset &iterator_QOffset, ///< [in|out] iterator over quant offsets in global memory + int &gemm_k_iterations) ///< [in|out] number of threadblock mainloop iterations remaining + { + // Unroll the warp-level MMA tiles of a threadblock's mainloop iteration + CUTLASS_PRAGMA_UNROLL + for (int warp_mma_k = 0; warp_mma_k < Base::kWarpGemmIterations; ++warp_mma_k) { + // In the case of small M, memory latency dominates. We try to move uses far + // from their definitions to hide latency. + if constexpr(debug_layout) { + if (LayoutDebugType::debug_fragment && layout_debug_.block_id_ == 1 && layout_debug_.warp_id_ == 0 && layout_debug_.lane_id_ == 0){ + printf("LINE %d, warp_tile_B kgroup %d\n", __LINE__, warp_mma_k % Base::kWarpGemmIterations); + } + LayoutDebugType::print_as_int4(pipe_state.warp_loaded_frag_B_, 'W', layout_debug_.block_id_, layout_debug_.warp_id_, layout_debug_.lane_id_); + LayoutDebugType::print_fragment(Operator::IteratorQScale::debug_expand(pipe_state.warp_loaded_frag_QScale_), 'Q', layout_debug_.block_id_, layout_debug_.warp_id_, layout_debug_.lane_id_); + if constexpr(kHasQOffset){ + LayoutDebugType::print_fragment(Operator::IteratorQScale::debug_expand(pipe_state.warp_loaded_frag_QOffset_), 'O', layout_debug_.block_id_, layout_debug_.warp_id_, layout_debug_.lane_id_); + } + } + + warp_mma_.transform( + pipe_state.warp_transformed_frag_B_[(warp_mma_k) % 2], + pipe_state.warp_loaded_frag_B_, + pipe_state.warp_loaded_frag_QScale_, + pipe_state.warp_loaded_frag_QOffset_); + + if constexpr(debug_layout) { + LayoutDebugType::print_fragment(pipe_state.warp_transformed_frag_B_[(warp_mma_k) % 2], 'B', layout_debug_.block_id_, layout_debug_.warp_id_, layout_debug_.lane_id_); + } + + // Loading next warp-level tiles from shared memory. 
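+      // (Added note: unlike mac_loop_iter() above, the transform of the
+      // already loaded B fragment was issued first; fetching the next
+      // fragments here puts maximum distance between these loads and their
+      // first use in the next iteration, hiding shared memory latency when M
+      // is too small to hide it with math instructions.)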
+ this->warp_tile_iterator_B_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations); + this->warp_tile_iterator_B_.load(pipe_state.warp_loaded_frag_B_); + ++this->warp_tile_iterator_B_; + + this->warp_tile_iterator_QScale_.load( + pipe_state.warp_loaded_frag_QScale_, + pipe_state.warp_loaded_frag_QOffset_); + ++this->warp_tile_iterator_QScale_; + + this->warp_tile_iterator_A_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations); + this->warp_tile_iterator_A_.load(pipe_state.warp_loaded_frag_A_[(warp_mma_k + 1) % 2]); + ++this->warp_tile_iterator_A_; + + // All warp-tiles issue their share of global->shared fragment copies + copy_tiles_and_advance( + iterator_A, + iterator_B, + (warp_mma_k + 1) % Base::kWarpGemmIterations); + + // Execute the current warp-tile of MMA operations + if (Detail::kStagedAccumulation) { + warp_mma_( + pipe_state.tmp_accum_, + pipe_state.warp_loaded_frag_A_[warp_mma_k % 2], + pipe_state.warp_transformed_frag_B_[warp_mma_k % 2], + pipe_state.tmp_accum_ + ); + + if (warp_mma_k == 0) { + plus plus_accum; + accum = plus_accum(accum, pipe_state.tmp_accum_); + pipe_state.tmp_accum_.clear(); + } + } else { + warp_mma_( + accum, + pipe_state.warp_loaded_frag_A_[warp_mma_k % 2], + pipe_state.warp_transformed_frag_B_[warp_mma_k % 2], + accum + ); + } + + // The second-to-last warp-tile also moves to the next global fetch stage + if (warp_mma_k == Base::kWarpGemmIterations - 2) { + // Inserts a memory fence between stages of cp.async instructions. + cutlass::arch::cp_async_fence(); + + // Move to the next global fetch stage + advance_smem_write_stage(iterator_A, iterator_B, iterator_QScale, iterator_QOffset); + advance_smem_read_stage(); + + // Disable global fetching when done with global fetch iterations + --gemm_k_iterations; + iterator_A.clear_mask(gemm_k_iterations == 0); + iterator_B.clear_mask(gemm_k_iterations == 0); + iterator_QScale.clear_mask(gemm_k_iterations == 0 || !should_load_qscale_); + if constexpr(kHasQOffset){ + iterator_QOffset.clear_mask(gemm_k_iterations == 0 || !should_load_qoffset_); + } + + copy_qscale_tiles(iterator_QScale); + copy_qoffset_tiles(iterator_QOffset); + + // Wait until we have at least one completed global fetch stage + gmem_wait(); + } + + } + } + + + /// Perform the specified number of threadblock mainloop iterations of matrix + /// multiply-accumulate. Assumes prologue has been initiated. 
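+  /// (Added worked example: with kStages == 3 and 10 K tiles, the prologue has
+  /// already consumed 2 iterations, so gemm_k_iterations enters at 8; the loop
+  /// below runs while gemm_k_iterations > -2, i.e. for 8, 7, ..., -1 - ten
+  /// iterations in total, the last two draining stages already resident in
+  /// shared memory with global fetches masked off.)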
+ CUTLASS_DEVICE + void gemm_iters( + int gemm_k_iterations, ///< number of threadblock mainloop iterations + FragmentC &accum, ///< [in|out] accumulator tile + IteratorA &iterator_A, ///< [in|out] iterator over A operand in global memory + IteratorB &iterator_B, ///< [in|out] iterator over B operand in global memory + IteratorQScale &iterator_QScale, ///< [in|out] iterator over QScale operand in global memory + IteratorQOffset &iterator_QOffset) ///< [in|out] iterator over QOffset operand in global memory + { + PipeState pipe_state; + + // Disable global fetching if done with global fetch iterations + iterator_A.clear_mask(gemm_k_iterations == 0); + iterator_B.clear_mask(gemm_k_iterations == 0); + iterator_QScale.clear_mask(gemm_k_iterations == 0 || !should_load_qscale_); + if constexpr(kHasQOffset) { + iterator_QOffset.clear_mask(gemm_k_iterations == 0 || !should_load_qoffset_); + } + + // Load first warp-tile's B fragment from shared memory + this->warp_tile_iterator_QScale_.load( + pipe_state.warp_loaded_frag_QScale_, + pipe_state.warp_loaded_frag_QOffset_); + ++this->warp_tile_iterator_QScale_; + + this->warp_tile_iterator_B_.set_kgroup_index(0); + this->warp_tile_iterator_B_.load(pipe_state.warp_loaded_frag_B_); + ++this->warp_tile_iterator_B_; + + // Load first warp-tile's A fragment from shared memory + this->warp_tile_iterator_A_.set_kgroup_index(0); + this->warp_tile_iterator_A_.load(pipe_state.warp_loaded_frag_A_[0]); + ++this->warp_tile_iterator_A_; + + copy_tiles_and_advance(iterator_A, iterator_B, 0); + + if constexpr(Shape::kM > 32) { + // the case of bigger m + if constexpr(debug_layout) { + if (LayoutDebugType::debug_fragment && layout_debug_.block_id_ == 1 && layout_debug_.warp_id_ == 0 && layout_debug_.lane_id_ == 0){ + printf("LINE %d, warp_tile_B kgroup %d\n", __LINE__, 0); + } + LayoutDebugType::print_as_int4(pipe_state.warp_loaded_frag_B_, 'W', layout_debug_.block_id_, layout_debug_.warp_id_, layout_debug_.lane_id_); + LayoutDebugType::print_fragment(Operator::IteratorQScale::debug_expand(pipe_state.warp_loaded_frag_QScale_), 'Q', layout_debug_.block_id_, layout_debug_.warp_id_, layout_debug_.lane_id_); + if constexpr(kHasQOffset){ + LayoutDebugType::print_fragment(Operator::IteratorQScale::debug_expand(pipe_state.warp_loaded_frag_QOffset_), 'O', layout_debug_.block_id_, layout_debug_.warp_id_, layout_debug_.lane_id_); + } + } + + warp_mma_.transform( + pipe_state.warp_transformed_frag_B_[0], + pipe_state.warp_loaded_frag_B_, + pipe_state.warp_loaded_frag_QScale_, + pipe_state.warp_loaded_frag_QOffset_); + + if constexpr(debug_layout) { + LayoutDebugType::print_fragment(pipe_state.warp_transformed_frag_B_[0], 'B', layout_debug_.block_id_, layout_debug_.warp_id_, layout_debug_.lane_id_); + } + } else { + // the case of small m + copy_qscale_tiles(iterator_QScale); + copy_qoffset_tiles(iterator_QOffset); + } + + if (Detail::kStagedAccumulation) { + pipe_state.tmp_accum_.clear(); + } + + // Mainloop + CUTLASS_GEMM_LOOP + for (; gemm_k_iterations > (-Base::kStages + 1);) { + if constexpr(Shape::kM > 32) { + mac_loop_iter( + pipe_state, + accum, + iterator_A, + iterator_B, + iterator_QScale, + iterator_QOffset, + gemm_k_iterations); + } else { + mac_loop_iter_small_m( + pipe_state, + accum, + iterator_A, + iterator_B, + iterator_QScale, + iterator_QOffset, + gemm_k_iterations); + } + } + + if (Detail::kStagedAccumulation) { + plus plus_accum; + accum = plus_accum(accum, pipe_state.tmp_accum_); + } + + // Commit and drain all pending and predicated cp.async pnz from the 
GEMM mainloop + cutlass::arch::cp_async_fence(); + cutlass::arch::cp_async_wait<0>(); + __syncthreads(); + + } + + + /// Perform a threadblock-scoped matrix multiply-accumulate + CUTLASS_DEVICE + void operator()( + ///< problem size of GEMM + int gemm_k_iterations, + ///< destination accumulator tile + FragmentC &accum, + ///< iterator over A operand in global memory + IteratorA iterator_A, + ///< iterator over B operand in global memory + IteratorB iterator_B, + ///< iterator over quant scales in global memory + IteratorQScale iterator_QScale, + ///< Iterator over quant offsets in global memory + IteratorQOffset iterator_QOffset, + ///< initial value of accumulator + FragmentC const &src_accum) { + + // Prologue (start fetching iterations of global fragments into shared memory) + prologue(iterator_A, iterator_B, iterator_QScale, iterator_QOffset, gemm_k_iterations); + + // Wait until we have at least one completed global fetch stage + gmem_wait(); + + // Initialize destination accumulators with source accumulators + accum = src_accum; + + // Perform the MAC-iterations + gemm_iters(gemm_k_iterations, accum, iterator_A, iterator_B, iterator_QScale, iterator_QOffset); + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace gemm +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/onnxruntime/core/mickey/cutlass_ext/q4gemm/warp/default_quantb_mma_tensor_op.h b/onnxruntime/core/mickey/cutlass_ext/q4gemm/warp/default_quantb_mma_tensor_op.h new file mode 100644 index 000000000000..2c49888c9450 --- /dev/null +++ b/onnxruntime/core/mickey/cutlass_ext/q4gemm/warp/default_quantb_mma_tensor_op.h @@ -0,0 +1,112 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/** + * Modifications Copyright (c) Microsoft. + * Licensed under the MIT license. + * + * @file default_quantb_mma_tensor_op.h + * @brief Modified from cutlass/gemm/warp/default_mma_tensor_op.h + * Default warp-level GEMM operators selected by data type, size, and layouts of operands. + */ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass_ext/q4gemm/warp/quantb_mma_tensor_op.h" + +namespace cutlass { +namespace gemm { +namespace warp { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Partial specialization for m-by-n-by-kgroup +template < + /// Shape of one matrix production operation (concept: GemmShape) + typename WarpShape_, + /// Shape of one matrix production operation (concept: GemmShape) + typename InstructionShape_, + /// Data type of A elements + typename ElementA, + /// Layout of A matrix (concept: MatrixLayout) + typename LayoutA, + /// Data type of B elements + typename ElementB, + /// Layout of B matrix (concept: MatrixLayout) + typename LayoutB, + /// Data type of quant scales + typename ElementQScale, + /// Layout of quant scales (concept: MatrixLayout) + typename SmemLayoutQScale, + /// Data type of quant offsets + typename ElementQOffset, + /// Layout of quant offsets (concept: MatrixLayout) + typename SmemLayoutQOffset, + /// Blocking size of quantization + typename QuantBlocking, + /// Element type of C matrix + typename ElementC, + /// Layout of C matrix (concept: MatrixLayout) + typename LayoutC, + /// Operator describing the tensor operation + typename Operator_ = arch::OpMultiplyAdd, + /// Number of partitions along K dimension + int PartitionsK = 1, + /// Store the accumulators in row major or column major. Row major is used + /// when output layout is interleaved. 
+ bool AccumulatorsInRowMajor = false> +struct DefaultQuantBMmaTensorOp { + using Policy = cutlass::gemm::warp::MmaTensorOpPolicy< + cutlass::arch::Mma, + cutlass::MatrixShape<1, 1> >; + + // Define the warp-level tensor op + using Type = cutlass::gemm::warp::QuantBMmaTensorOp< + WarpShape_, ElementA, LayoutA, ElementB, LayoutB, ElementQScale, SmemLayoutQScale, + ElementQOffset, SmemLayoutQOffset, QuantBlocking, ElementC, LayoutC, + Policy, PartitionsK, AccumulatorsInRowMajor>; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace warp +} // namespace gemm +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// + +#include "cutlass/gemm/warp/mma_complex_tensor_op_tile_iterator_sm80.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/onnxruntime/core/mickey/cutlass_ext/q4gemm/warp/quantb_meta_mma_tensor_op_tile_iterator.h b/onnxruntime/core/mickey/cutlass_ext/q4gemm/warp/quantb_meta_mma_tensor_op_tile_iterator.h new file mode 100644 index 000000000000..26239161cf8a --- /dev/null +++ b/onnxruntime/core/mickey/cutlass_ext/q4gemm/warp/quantb_meta_mma_tensor_op_tile_iterator.h @@ -0,0 +1,882 @@ +/** + * Copyright (c) Microsoft Corporation. All rights reserved. + * Licensed under the MIT license. + * + * @file quantb_meta_mma_tensor_op_tile_iterator.h + * @brief Templates for loading quantization meta data for operand B + * from shared memory to fragments. This is meant to be used in + * lock step with the operand B tile iterator. Containing logic + * to figure out the operand B layout in the tensor core, + * and deliver each meta data element to its corresponding + * operand B element for dequantization. + */ + +#pragma once + +#include "cutlass/cutlass.h" + +#include "cutlass/array.h" +#include "cutlass/numeric_types.h" +#include "cutlass/tensor_ref.h" +#include "cutlass/matrix_shape.h" + +#include "cutlass/arch/memory_sm75.h" +#include "cutlass/gemm/gemm.h" + +#include "cutlass/layout/matrix.h" +#include "cutlass/layout/tensor.h" +#include "cutlass/layout/pitch_linear.h" +#include "cutlass/layout/tensor_op_multiplicand_sm75.h" + +#include "cutlass/platform/platform.h" +#include "cutlass/fast_math.h" + +//////////////////////////////////////////////////////////////////////////////// + +namespace{ + +struct b32_pair{ + uint32_t a; + uint32_t b; +}; + +struct fp16_quad{ + cutlass::half_t a; + cutlass::half_t b; + cutlass::half_t c; + cutlass::half_t d; +}; + +struct b16_quad{ + int16_t a; + int16_t b; + int16_t c; + int16_t d; +}; + +union b64 { + uint64_t single; + b32_pair pair; + b16_quad quard; + fp16_quad fp16_quad; +}; + +static_assert(sizeof(b64) == 8, "b64 should be 64 bits"); + +/// Convert packed 4b weights into fp16(weight + 16) +/// Current bit hacking only supports fp16, need to add bf16 later. 
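+/// (Added worked bit pattern: fp16 16.0 is 0x4C00 - exponent 2^4, zero
+/// mantissa - and a 4-bit weight w placed at mantissa bits [6..9] contributes
+/// w * 2^-4 * 2^4 == w, so ((w << 6) & 0x03C0) | 0x4C00 == half(16 + w); the
+/// lop3 immediate 0xEA below computes exactly (a & b) | c on both 16-bit
+/// halves of a register at once.)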
+/// +template +CUTLASS_DEVICE +void weights2Half(cutlass::Array const &weights, + cutlass::Array& dest) +{ + static_assert(Size % 8 == 0, "Weights should have been prepacked by 2x2 tiles, 2 weights per tile."); + uint32_t* dest_pair = reinterpret_cast(dest.data()); + const uint32_t* w_oct = reinterpret_cast(weights.data()); + + CUTLASS_PRAGMA_UNROLL + for (int oct_idx = 0; oct_idx < Size/8; oct_idx++, w_oct++, dest_pair += 4){ +#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)) + + // static_cast(16 + weight) + // 4b weights are prepacked into [0, 2, 4, 6, 1, 3, 5, 7], so that adjacent weights + // are in different 16b half words, making it easier to convert to fp16. + asm volatile( + "{\n\t" + " shl.b32 %0, %4, 6;\n" + " shl.b32 %1, %4, 2;\n" + " shr.u32 %2, %4, 2;\n" + " shr.u32 %3, %4, 6;\n" + " lop3.b32 %0, %0, 0x03c003c0, 0x4c004c00, 0xea;\n" // a & 0x03c0 | 0x4c00 + " lop3.b32 %1, %1, 0x03c003c0, 0x4c004c00, 0xea;\n" + " lop3.b32 %2, %2, 0x03c003c0, 0x4c004c00, 0xea;\n" + " lop3.b32 %3, %3, 0x03c003c0, 0x4c004c00, 0xea;\n" + "}\n" + : "=r"(dest_pair[0]), "=r"(dest_pair[1]), + "=r"(dest_pair[2]), "=r"(dest_pair[3]) + : "r"(*w_oct)); +#else + assert(0); +#endif + } + +} + +} // namespace + +//////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace gemm { +namespace warp { + +//////////////////////////////////////////////////////////////////////////////// + +// Traits to describe the layout of quantization meta data layout in a MMA fragment +// Since operand B is quantized on a per block basis, it's one meta data per block. + +template < + /// Shape of the operand B matrix to load in a warp (concept: MatrixShape) + typename WarpShapeB_, + /// Block dimensions of the blockwise quantization. So the actual meta data + /// warp shape is WarpShapeB_ / BlockingShape_ + typename BlockingShape_, + /// Underlying matrix multiply operator (concept: arch::Mma) + typename ArchMmaOperator_, + /// Number of threads participating in one matrix operation + int Threads> +class QuantBMetaMmaTile{ +public: + + using WarpShapeB = WarpShapeB_; + using BlockingShape = BlockingShape_; + using ArchMmaOperator = ArchMmaOperator_; + + static_assert(Threads == 32, "This iterator should work in a warp only."); + + /// Shape of the curresponding operand B tile iterator + using TileShapeB = MatrixShape; + + // Tensor core operand B layout is a column major 4x8 tile, divided + // into 32 threads (T0 ~ T31) as shown below. Each element of the tile is 32b, + // so for fp16 it becomes 8 x 8, and int8 it becomes 16 x 8. 
+ // T0 | T4 | T8 | T12 | T16 | T20 | T24 | T28 + // T1 | T5 | T9 | T13 | T17 | T21 | T25 | T29 + // T2 | T6 | T10 | T14 | T18 | T22 | T26 | T30 + // T3 | T7 | T11 | T15 | T19 | T23 | T27 | T31 + using CoreTile = layout::PitchLinearShape<4, 8>; + + /// Each thread holds a 32b fragment per tile: for half precision, it's 2 elements, 4 elements for int8 + static int const kNumBsPerCoreTileFragement = 32 / sizeof_bits::value; + + /// Each mma instruction can process either 1 or 2 tensor core operand B tiles (stacked on the k dimension) + static int const kBTilesPerMma = + sizeof_bits::value * ArchMmaOperator::FragmentB::kElements / 32; + static_assert(kBTilesPerMma == 1 || kBTilesPerMma == 2, "Only support 1 or 2 operand B tiles per mma."); + + /// Each operand B tile iterator load covers a number of mma instructions + static int const kMmaIterationsB = WarpShapeB::kColumn / ArchMmaOperator::Shape::kN; + + /// Number of B elements a fragment of meta data should cover + static int const kExpandedSize = kNumBsPerCoreTileFragement * kBTilesPerMma * kMmaIterationsB; + + // Now we figure out how many meta data elements to load for each TileShapeB + + /// Number of meta elements per CoreTile. + static int const kCoreTileFragementSize = (kNumBsPerCoreTileFragement + BlockingShape::kRow - 1) / BlockingShape::kRow; + + /// Number of core tiles per mma instruction, different from kBTilesPerMma when blocking size on K dimension + /// exceeds the tile depth, so two tiles share the same meta data + static int const kTilesPerMma = ((kBTilesPerMma == 2) && + (BlockingShape::kRow <= kNumBsPerCoreTileFragement * CoreTile::kContiguous)) + ? 2 : 1; + + /// stride to reach the meta data for the next CoreTile on the K dimension + static int const kKTileStride = (kNumBsPerCoreTileFragement * CoreTile::kContiguous + BlockingShape::kRow - 1) / BlockingShape::kRow; + + /// Stride on N dimension should be the tile width, shrunk by blocking size on this dimension. + static int const kNStride = (CoreTile::kStrided + BlockingShape::kColumn - 1) / BlockingShape::kColumn; + + /// On N dimension, how many tiles share the same meta data + static int const kNRepeats = (BlockingShape::kColumn + CoreTile::kStrided - 1) / CoreTile::kStrided; + + /// Each fragment should cover kMmaIterationsB number of mma intructions on the N dimension. + /// When blocking size on this dimension exceeds the tile width, multiple iterations + /// would share the same data. + static int const kMmaIterations = (kMmaIterationsB + kNRepeats - 1) / kNRepeats; + + static int const kFragementSize = kCoreTileFragementSize * kTilesPerMma * kMmaIterations; + + CUTLASS_DEVICE + static MatrixCoord lane_position(int lane_id) { + if constexpr(kNumBsPerCoreTileFragement == 2 + && kBTilesPerMma == 2 + && BlockingShape::kRow == 1){ + // Optimize for a special case of: + // 16b gemm (kNumBsPerCoreTileFragement == 2) + // 2 B operand tiles per mma (kBTilesPerMma == 2) + // (1,n) quantization blocking + // The scale and offset tensors are prepacked to reduce the number of load instructions. + return make_Coord((lane_id % CoreTile::kContiguous) * 4, + lane_id / CoreTile::kContiguous); + } else { + return make_Coord((lane_id % CoreTile::kContiguous) * kNumBsPerCoreTileFragement, + lane_id / CoreTile::kContiguous); + } + } +}; + + +//////////////////////////////////////////////////////////////////////////////// + +/// This tile iterator is to load quantization meta data for operand B from +/// shared memory to fragments (hopefully allocated to registers by compilers). 
+/// Examples of meta data include scale or offsets. The operand B matrix is +/// quantized on a per block basis, meaning one element of meta data per block. +/// +/// This is meant to be used in lock step with the operand B tile iterator. +/// So all parameters are logical positions in the operand B tiles. +/// The goal here is to deliver each meta data element to its corresponding +/// operand B element for dequantization. As a result, we need to figure +/// out the operand B layout in the tensor core. +/// +template < + /// Shape of the operand B matrix to load in a warp (concept: MatrixShape) + typename WarpShapeB_, + /// Block dimensions of the blockwise quantization. So the actual meta data + /// warp shape is WarpShapeB_ / BlockingShape_ + typename BlockingShape_, + /// Data type of the quant scales + typename ElementScale_, + /// Layout of the quant scales + typename LayoutScale_, + /// Data type of quant offsets + typename ElementOffset_, + /// Layout of quant offsets + typename LayoutOffset_, + /// Underlying matrix multiply operator (concept: arch::Mma) + typename ArchMmaOperator_, + /// Number of threads participating in one matrix operation + int Threads, + /// Number of partitions along K dimension + int PartitionsK_ = 1> +class QuantBMetaMmaTensorOpTileIterator; + +//////////////////////////////////////////////////////////////////////////////// + +/// Specialization for column major layout + +template < + /// Shape of the operand B matrix to load in a warp (concept: MatrixShape) + typename WarpShapeB_, + /// Block dimensions of the blockwise quantization. So the actual meta data + /// warp shape is WarpShapeB_ / BlockingShape_ + typename BlockingShape_, + /// Data type of the meta data elements + typename ElementScale_, + /// Data type of quant offsets + typename ElementOffset_, + /// Underlying matrix multiply operator (concept: arch::Mma) + typename ArchMmaOperator_, + /// Number of threads participating in one matrix operation + int Threads> +class QuantBMetaMmaTensorOpTileIterator{ +public: + + using WarpShapeB = WarpShapeB_; + using BlockingShape = BlockingShape_; + using ElementScale = ElementScale_; + using Layout = cutlass::layout::ColumnMajor; + using ElementOffset = ElementOffset_; + using ArchMmaOperator = ArchMmaOperator_; + + static constexpr bool kHasOffset = !(std::is_same::value); + + static_assert(BlockingShape::kRow == 1 && BlockingShape::kColumn > 1, + "Only support row blocking for column major layout"); + + using MetaTile = QuantBMetaMmaTile; + + /// Number of MMA instructions for this tile + static constexpr int kMmaIterationsB = MetaTile::kMmaIterationsB; + + /// Number of B elements per mma tile fragment (32b), 2 for half precision, 4 for int8 + static constexpr int kNumBsPerCoreTileFragement = MetaTile::kNumBsPerCoreTileFragement; + + /// Each mma instruction can process either 1 or 2 operand B tiles (stacked on the k dimension) + static constexpr int kBTilesPerMma = MetaTile::kBTilesPerMma; + + /// Number of B elements a fragment of meta data should cover + static constexpr int kExpandedSize = MetaTile::kExpandedSize; + + /// Number of meta elements per core tile fragment + static constexpr int kCoreTileFragementSize = MetaTile::kCoreTileFragementSize; + + /// stride for reaching the next core tile (if there is one) on the K dimension + static constexpr int kKTileStride = MetaTile::kKTileStride; + + /// do we need to load meta data for the next core tile on the K dimension? 
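+  /// (the answer is yes when kTilesPerMma == 2)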
+ static constexpr int kTilesPerMma = MetaTile::kTilesPerMma; + + static constexpr int kNStride = MetaTile::kNStride; + static constexpr int kNRepeats = MetaTile::kNRepeats; + static constexpr int kMmaIterations = MetaTile::kMmaIterations; + + using TensorRefScale = TensorRef; + using TensorRefOffset = TensorRef; + using TensorCoord = typename Layout::TensorCoord; + + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + using StrideIndex = typename Layout::Stride::Index; + + using FragmentScale = Array; + using FragmentOffset = typename std::conditional, + std::monostate>::type; + + using AccessTypeScale = Array; + using AccessTypeOffset = Array; + +private: + + ElementScale *pointer_; + Layout layout_; + + ElementOffset *pointer_offset_; + Layout layout_offset_; + + TensorCoord lane_position_; + +public: + + CUTLASS_DEVICE + QuantBMetaMmaTensorOpTileIterator() { } + + CUTLASS_DEVICE + QuantBMetaMmaTensorOpTileIterator( + TensorRefScale const &ref, + TensorRefOffset const &ref_offset, + int lane_idx + ): + pointer_(ref.data()), + layout_(ref.layout()), + pointer_offset_(ref_offset.data()), + layout_offset_(ref_offset.layout()), + lane_position_(MetaTile::lane_position(lane_idx)){} + + /// Loads a fragment + CUTLASS_HOST_DEVICE + void load(FragmentScale &frag, FragmentOffset &frag_offset) { + if constexpr(kNumBsPerCoreTileFragement == 2 + && kBTilesPerMma == 2){ + // Optimize for a special case of: + // 16b gemm (kNumBsPerCoreTileFragement == 2) + // 2 B operand tiles per mma (kBTilesPerMma == 2) + // (1,n) quantization blocking (BlockingShape::kRow == 1) + // The scale and offset tensors are prepacked to reduce the number of load instructions needed + const int row = lane_position_.row(); + const int column = lane_position_.column() / BlockingShape::kColumn; + + Array *dst_ptr = reinterpret_cast*>(frag.data()); + CUTLASS_PRAGMA_UNROLL + for (int n_idx = 0, c = column; n_idx < kMmaIterations; n_idx++, c += kNStride){ + Array *src_ptr = reinterpret_cast*>(pointer_ + layout_({row, c})); + *dst_ptr = *src_ptr; + dst_ptr++; + } + + if constexpr(kHasOffset){ + Array *dst_ptr_offset = reinterpret_cast*>(frag_offset.data()); + CUTLASS_PRAGMA_UNROLL + for (int n_idx = 0, c = column; n_idx < kMmaIterations; n_idx++, c += kNStride){ + Array *src_ptr_offset = reinterpret_cast*>(pointer_offset_ + layout_offset_({row, c})); + *dst_ptr_offset = *src_ptr_offset; + dst_ptr_offset++; + } + } + + } else { + // Other cases, offsets and scales are not prepacked. 
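+      // Each meta data element is loaded individually here, stepping by kKTileStride
+      // along the K dimension and kNStride along the N dimension.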
+ + const int row = lane_position_.row() / BlockingShape::kRow; + const int column = lane_position_.column() / BlockingShape::kColumn; + + AccessTypeScale* dst_ptr = reinterpret_cast(frag.data()); + CUTLASS_PRAGMA_UNROLL + for (int n_idx = 0, c = column; n_idx < kMmaIterations; n_idx++, c += kNStride){ + CUTLASS_PRAGMA_UNROLL + for (int mma_tile_idx = 0, r = row; mma_tile_idx < kTilesPerMma; mma_tile_idx++, r += kKTileStride){ + AccessTypeScale* src_ptr = reinterpret_cast(pointer_ + layout_({r, c})); + *dst_ptr = *src_ptr; + dst_ptr++; + } + } + + if constexpr(kHasOffset){ + AccessTypeOffset* dst_ptr = reinterpret_cast(frag_offset.data()); + CUTLASS_PRAGMA_UNROLL + for (int n_idx = 0, c = column; n_idx < kMmaIterations; n_idx++, c += kNStride){ + CUTLASS_PRAGMA_UNROLL + for (int mma_tile_idx = 0, r = row; mma_tile_idx < kTilesPerMma; mma_tile_idx++, r += kKTileStride){ + AccessTypeOffset* src_ptr = reinterpret_cast(pointer_offset_ + layout_offset_({r, c})); + *dst_ptr = *src_ptr; + dst_ptr++; + } + } + } + } + } + + template + CUTLASS_HOST_DEVICE + static Array debug_expand(Array const &frag){ + Array ret; + int out_idx = 0; + CUTLASS_PRAGMA_UNROLL + for (int n_out = 0; n_out < kMmaIterationsB; n_out++){ + int n_idx = n_out / kNRepeats; + CUTLASS_PRAGMA_UNROLL + for (int mma_tile_out_idx = 0; mma_tile_out_idx < kBTilesPerMma; mma_tile_out_idx++){ + int mma_tile_idx = mma_tile_out_idx / (kBTilesPerMma / kTilesPerMma); + CUTLASS_PRAGMA_UNROLL + for (int elem_out_idx = 0; elem_out_idx < kNumBsPerCoreTileFragement; elem_out_idx++){ + int elem_idx = elem_out_idx / BlockingShape::kRow; + int idx = elem_idx + mma_tile_idx * kCoreTileFragementSize + n_idx * kCoreTileFragementSize * kTilesPerMma; + ret[out_idx] = frag[idx]; + out_idx++; + } + } + } + return ret; + } + + CUTLASS_HOST_DEVICE + static void dequant(FragmentScale const &scales, + FragmentOffset const &fragment_offsets, + Array const &weights, + Array& dest){ + static_assert(kNumBsPerCoreTileFragement == 2, "Only for 16b gemm."); + static_assert(kExpandedSize % 8 == 0, "Weights should have been prepacked by 2x2 tiles, 2 weights per tile."); + + // First convert 4b weight into fp16(weight + 16) + weights2Half(weights, dest); + + if constexpr(kBTilesPerMma == 2){ + // Optimize for a special case of: + // 2 B operand tiles per mma (kBTilesPerMma == 2) + // (1,n) quantization blocking (BlockingShape::kRow == 1) + + uint32_t* dest_pair = reinterpret_cast(dest.data()); + const b64* scales_ptr = reinterpret_cast(scales.data()); + [[maybe_unused]] const ElementOffset* fragment_offsets_ptr = nullptr; + if constexpr(kHasOffset) { fragment_offsets_ptr = fragment_offsets.data(); } + + CUTLASS_PRAGMA_UNROLL + for (int n_idx = 0; n_idx < kMmaIterations; n_idx++){ + // dequantize: d = scale * (weight - offset) + // to use FMA, d = scale * weight + (scale * (-offset)) + + [[maybe_unused]] b64 offsets{0}; + if constexpr(kHasOffset) { +#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)) + const uint32_t* p = reinterpret_cast(fragment_offsets_ptr); + asm volatile( + "{\n\t" + " .reg .b32 rb0, rb1;\n" // b32 regs for fp16x2 mul operands + + // static_cast(-16 - offset) + // input [d, b, c, a], + " shl.b32 rb0, %4, 6;\n" // rb0 = [x, b, x, a] << 6 + " shr.u32 rb1, %4, 2;\n" // rb1 = [x, d, x, c] << 6 + " lop3.b32 rb0, rb0, 0x03c003c0, 0xcc00cc00, 0xea;\n" // a & 0x03c0 | 0xcc00 + " lop3.b32 rb1, rb1, 0x03c003c0, 0xcc00cc00, 0xea;\n" + " mul.rn.f16x2 %0, %2, rb0;\n" // offset = scale * (-16 - offset) + " mul.rn.f16x2 %1, %3, rb1;\n" + "}\n" + : 
"=r"(offsets.pair.a), "=r"(offsets.pair.b) + : "r"(scales_ptr->pair.a), "r"(scales_ptr->pair.b), + "r"(p[0])); +#else + assert(0); +#endif + + fragment_offsets_ptr += 4; + } else { +#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)) + asm volatile( + "{\n\t" + " .reg .b32 rb0;\n" + " mov.u32 rb0, 0xce00ce00;\n" + " mul.rn.f16x2 %0, %2, rb0;\n" // offset = scale * (-16 - 8) + " mul.rn.f16x2 %1, %3, rb0;\n" + "}\n" + : "=r"(offsets.pair.a), "=r"(offsets.pair.b) + : "r"(scales_ptr->pair.a), "r"(scales_ptr->pair.b)); +#else + offsets.fp16_quad.a = scales_ptr->fp16_quad.a * static_cast(-16-8); + offsets.fp16_quad.b = scales_ptr->fp16_quad.b * static_cast(-16-8); + offsets.fp16_quad.c = scales_ptr->fp16_quad.c * static_cast(-16-8); + offsets.fp16_quad.d = scales_ptr->fp16_quad.d * static_cast(-16-8); +#endif + } + + CUTLASS_PRAGMA_UNROLL + for (int n_r = 0; n_r < kNRepeats; n_r++){ +#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)) + asm volatile( + "{\n\t" + " fma.rn.f16x2 %0, %2, %0, %4;\n" // dest = scale * (16 + weight) + (scale * (-16 - offset)) + " fma.rn.f16x2 %1, %3, %1, %5;\n" + "}\n" + : "+r"(dest_pair[0]), "+r"(dest_pair[1]) + : "r"(scales_ptr->pair.a), "r"(scales_ptr->pair.b), + "r"(offsets.pair.a), "r"(offsets.pair.b)); +#else + assert(0); +#endif + dest_pair += 2; + } + scales_ptr++; + } + + } else { + // unoptiomized path for other cases, very slow + int out_idx = 0; + ElementScale offset; + CUTLASS_PRAGMA_UNROLL + for (int n_out = 0; n_out < kMmaIterationsB; n_out++){ + int n_idx = n_out / kNRepeats; + CUTLASS_PRAGMA_UNROLL + for (int mma_tile_out_idx = 0; mma_tile_out_idx < kBTilesPerMma; mma_tile_out_idx++){ + int mma_tile_idx = mma_tile_out_idx / (kBTilesPerMma / kTilesPerMma); + CUTLASS_PRAGMA_UNROLL + for (int elem_out_idx = 0; elem_out_idx < kNumBsPerCoreTileFragement; elem_out_idx++){ + int elem_idx = elem_out_idx / BlockingShape::kRow; + int idx = elem_idx + mma_tile_idx * kCoreTileFragementSize + n_idx * kCoreTileFragementSize * kTilesPerMma; + ElementScale s = scales[idx]; + if constexpr(kHasOffset){ + offset = s * static_cast(-16 - static_cast(fragment_offsets[idx])); + } else { + offset = s * static_cast(-16-8); + } + dest[out_idx] = s * dest[out_idx] + offset; + out_idx++; + } + } + } + + } + + } + + /// Advances the pointer + CUTLASS_HOST_DEVICE + QuantBMetaMmaTensorOpTileIterator &operator++() { + // This is for operand B, so advance on the K dimension + lane_position_ += make_Coord(MetaTile::TileShapeB::kRow, 0); + return *this; + } + + CUTLASS_DEVICE + QuantBMetaMmaTensorOpTileIterator &add_tile_offset( + TensorCoord const &tile_offset) { + int rows = tile_offset.row() * MetaTile::TileShapeB::kRow; + int columns = tile_offset.column() * MetaTile::TileShapeB::kColumn; + lane_position_ += TensorCoord(rows, columns); + return *this; + } + +}; + + +//////////////////////////////////////////////////////////////////////////////// + +/// Specialization for row major layout + +template < + /// Shape of the operand B matrix to load in a warp (concept: MatrixShape) + typename WarpShapeB_, + /// Block dimensions of the blockwise quantization. 
So the actual meta data + /// warp shape is WarpShapeB_ / BlockingShape_ + typename BlockingShape_, + /// Data type of the meta data elements + typename ElementScale_, + /// Data type of quant offsets + typename ElementOffset_, + /// Underlying matrix multiply operator (concept: arch::Mma) + typename ArchMmaOperator_, + /// Number of threads participating in one matrix operation + int Threads> +class QuantBMetaMmaTensorOpTileIterator{ +public: + + using WarpShapeB = WarpShapeB_; + using BlockingShape = BlockingShape_; + using ElementScale = ElementScale_; + using ElementOffset = ElementOffset_; + using Layout = cutlass::layout::RowMajor; + using ArchMmaOperator = ArchMmaOperator_; + + static constexpr bool kHasOffset = !(std::is_same::value); + + static_assert(BlockingShape::kColumn == 1 && BlockingShape::kRow > 1, + "Only support column blocking for row major layout"); + + using MetaTile = QuantBMetaMmaTile; + + /// Number of MMA instructions for this tile + static constexpr int kMmaIterationsB = MetaTile::kMmaIterationsB; + + /// Number of B elements per mma tile fragment (32b), 2 for half precision, 4 for int8 + static constexpr int kNumBsPerCoreTileFragement = MetaTile::kNumBsPerCoreTileFragement; + + /// Each mma instruction can process either 1 or 2 operand B tiles (stacked on the k dimension) + static constexpr int kBTilesPerMma = MetaTile::kBTilesPerMma; + + /// Number of B elements a fragment of meta data should cover + static constexpr int kExpandedSize = MetaTile::kExpandedSize; + + /// Number of meta elements per core tile fragment + static constexpr int kCoreTileFragementSize = MetaTile::kCoreTileFragementSize; + + /// stride for reaching the next core tile (if there is one) on the K dimension + static constexpr int kKTileStride = MetaTile::kKTileStride; + + /// do we need to load meta data for the next core tile on the K dimension? 
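+  /// (the answer is yes when kTilesPerMma == 2)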
+ static constexpr int kTilesPerMma = MetaTile::kTilesPerMma; + + static constexpr int kNStride = MetaTile::kNStride; + static constexpr int kNRepeats = MetaTile::kNRepeats; + static constexpr int kMmaIterations = MetaTile::kMmaIterations; + + using TensorRefScale = TensorRef; + using TensorRefOffset = TensorRef; + using TensorCoord = typename Layout::TensorCoord; + + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + using StrideIndex = typename Layout::Stride::Index; + + using FragmentScale = Array; + using FragmentOffset = typename std::conditional, + std::monostate>::type; + +private: + + ElementScale *pointer_; + Layout layout_; + + ElementOffset *pointer_offset_; + Layout layout_offset_; + + TensorCoord lane_position_; + +public: + + CUTLASS_DEVICE + QuantBMetaMmaTensorOpTileIterator() { } + + CUTLASS_DEVICE + QuantBMetaMmaTensorOpTileIterator( + TensorRefScale const &ref, + TensorRefOffset const &ref_offset, + int lane_idx + ): + pointer_(ref.data()), + layout_(ref.layout()), + pointer_offset_(ref_offset.data()), + layout_offset_(ref_offset.layout()), + lane_position_(MetaTile::lane_position(lane_idx)) + {} + + /// Loads a fragment + CUTLASS_HOST_DEVICE + void load(FragmentScale &frag, FragmentOffset &frag_offset) { + const int row = lane_position_.row() / BlockingShape::kRow; + const int column = lane_position_.column() / BlockingShape::kColumn; + static_assert(kTilesPerMma * kCoreTileFragementSize == 1, "Only support one meta data per core tile"); + + ElementScale* src_ptr = pointer_ + layout_({row, column}); + ElementScale* dst_ptr = frag.data(); + CUTLASS_PRAGMA_UNROLL + for (int n_idx = 0; n_idx < kMmaIterations; n_idx++){ + dst_ptr[n_idx] = src_ptr[n_idx * kNStride]; + } + + if constexpr(kHasOffset){ + ElementOffset* src_ptr_offset = pointer_offset_ + layout_offset_({row, column}); + ElementOffset* dst_ptr_offset = frag_offset.data(); + CUTLASS_PRAGMA_UNROLL + for (int n_idx = 0; n_idx < kMmaIterations; n_idx++){ + dst_ptr_offset[n_idx] = src_ptr_offset[n_idx * kNStride]; + } + } + } + + template + CUTLASS_HOST_DEVICE + static Array debug_expand(Array const &frag){ + Array ret; + + int out_idx = 0; + CUTLASS_PRAGMA_UNROLL + for (int n_out = 0; n_out < kMmaIterationsB; n_out++){ + int n_idx = n_out / kNRepeats; + CUTLASS_PRAGMA_UNROLL + for (int mma_tile_out_idx = 0; mma_tile_out_idx < kBTilesPerMma; mma_tile_out_idx++){ + int mma_tile_idx = mma_tile_out_idx / (kBTilesPerMma / kTilesPerMma); + CUTLASS_PRAGMA_UNROLL + for (int elem_out_idx = 0; elem_out_idx < kNumBsPerCoreTileFragement; elem_out_idx++){ + int elem_idx = elem_out_idx / BlockingShape::kRow; + int col = elem_idx + mma_tile_idx * kCoreTileFragementSize; + int idx = col * kMmaIterations + n_idx; + ret[out_idx] = frag[idx]; + out_idx++; + } + } + } + return ret; + } + + CUTLASS_HOST_DEVICE + static void dequant(FragmentScale const &scales, + FragmentOffset const &offsets, + Array const &weights, + Array& dest){ + static_assert(kNRepeats == 1, "This is implied by BlockingShape::kColumn == 1"); + static_assert(kNumBsPerCoreTileFragement == 2, "Only for 16b gemm now."); + + // First convert 4b weight into fp16(weight + 16) + weights2Half(weights, dest); + + ElementScale addon[kMmaIterationsB]; + if constexpr (kMmaIterationsB % 4 == 0) { + const b64* scales_ptr = reinterpret_cast(scales.data()); + uint32_t* addon_ptr = reinterpret_cast(addon); + if constexpr(kHasOffset){ + const uint32_t* p = reinterpret_cast(offsets.data()); + CUTLASS_PRAGMA_UNROLL + for (int n_idx = 0; n_idx < 
kMmaIterationsB; n_idx += 4) {
+#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800))
+        asm volatile(
+            "{\n\t"
+            "  .reg .b32 rb0, rb1, rb2;\n"
+
+            // offset from [d, c, b, a] --> [d, b, c, a]
+            "  prmt.b32 rb2, %4, rb0, 0x3120;\n"
+
+            // static_cast<cutlass::half_t>(-16 - offset)
+            // input [d, b, c, a],
+            "  shl.b32 rb0, rb2, 6;\n"  // rb0 = [x, b, x, a] << 6
+            "  shr.u32 rb1, rb2, 2;\n"  // rb1 = [x, d, x, c] << 6
+            "  lop3.b32 rb0, rb0, 0x03c003c0, 0xcc00cc00, 0xea;\n"  // a & 0x03c0 | 0xcc00
+            "  lop3.b32 rb1, rb1, 0x03c003c0, 0xcc00cc00, 0xea;\n"
+            "  mul.rn.f16x2 %0, %2, rb0;\n"  // offset = scale * (-16 - offset)
+            "  mul.rn.f16x2 %1, %3, rb1;\n"
+            "}\n"
+            : "=r"(addon_ptr[0]), "=r"(addon_ptr[1])
+            : "r"(scales_ptr->pair.a), "r"(scales_ptr->pair.b),
+              "r"(p[0]));
+#else
+        assert(0);
+#endif
+        scales_ptr++;
+        p++;
+        addon_ptr += 2;
+      }
+    } else {
+      CUTLASS_PRAGMA_UNROLL
+      for (int n_idx = 0; n_idx < kMmaIterationsB; n_idx += 4) {
+#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800))
+        asm volatile(
+            "{\n\t"
+            "  .reg .b32 rb0;\n"
+            "  mov.u32 rb0, 0xce00ce00;\n"
+            "  mul.rn.f16x2 %0, %2, rb0;\n"  // offset = scale * (-16 - 8)
+            "  mul.rn.f16x2 %1, %3, rb0;\n"
+            "}\n"
+            : "=r"(addon_ptr[0]), "=r"(addon_ptr[1])
+            : "r"(scales_ptr->pair.a), "r"(scales_ptr->pair.b));
+#else
+        assert(0);
+#endif
+        scales_ptr++;
+        addon_ptr += 2;
+      }
+    }
+  } else if constexpr (kMmaIterationsB % 2 == 0) {
+    if constexpr (kHasOffset) {
+#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800))
+      const uint32_t* scales_ptr = reinterpret_cast<const uint32_t*>(scales.data());
+      uint32_t* addon_ptr = reinterpret_cast<uint32_t*>(addon);
+      // possible buffer over-read of 2 bytes here.
+      const uint32_t* p = reinterpret_cast<const uint32_t*>(offsets.data());
+
+      asm volatile(
+          "{\n\t"
+          "  .reg .b32 rb0, rb1, rb2;\n"
+
+          // offset from [?, ?, b, a] --> [?, b, ?, a]
+          "  prmt.b32 rb2, %2, rb0, 0x3120;\n"
+
+          // static_cast<cutlass::half_t>(-16 - offset)
+          // input [d, b, c, a],
+          "  shl.b32 rb0, rb2, 6;\n"  // rb0 = [x, b, x, a] << 6
+          "  lop3.b32 rb0, rb0, 0x03c003c0, 0xcc00cc00, 0xea;\n"  // a & 0x03c0 | 0xcc00
+          "  mul.rn.f16x2 %0, %1, rb0;\n"  // offset = scale * (-16 - offset)
+          "}\n"
+          : "=r"(addon_ptr[0])
+          : "r"(scales_ptr[0]),
+            "r"(p[0]));
+#else
+      assert(0);
+#endif
+    } else {
+#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800))
+      const uint32_t* scales_ptr = reinterpret_cast<const uint32_t*>(scales.data());
+      uint32_t* addon_ptr = reinterpret_cast<uint32_t*>(addon);
+      asm volatile(
+          "{\n\t"
+          "  .reg .b32 rb0;\n"
+          "  mov.u32 rb0, 0xce00ce00;\n"
+          "  mul.rn.f16x2 %0, %1, rb0;\n"  // offset = scale * (-16 - 8)
+          "}\n"
+          : "=r"(addon_ptr[0])
+          : "r"(scales_ptr[0]));
+#else
+      assert(0);
+#endif
+    }
+  } else {
+    // kMmaIterationsB == 1
+    if constexpr (kHasOffset) {
+      uint8_t zp = offsets[0];
+      addon[0] = scales[0] * static_cast<ElementScale>(-16 - static_cast<int>(zp));
+    } else {
+      addon[0] = scales[0] * static_cast<ElementScale>(-16 - 8);
+    }
+  }
+
+  int out_idx = 0;
+  CUTLASS_PRAGMA_UNROLL
+  for (int n_out = 0; n_out < kMmaIterationsB; n_out++) {
+    CUTLASS_PRAGMA_UNROLL
+    for (int mma_tile_out_idx = 0; mma_tile_out_idx < kBTilesPerMma; mma_tile_out_idx++) {
+      dest[out_idx] = scales[n_out] * dest[out_idx] + addon[n_out];
+      dest[out_idx + 1] = scales[n_out] * dest[out_idx + 1] + addon[n_out];
+      out_idx += 2;
+    }
+  }
+ }
+
+  /// Advances the pointer
+  CUTLASS_HOST_DEVICE
+  QuantBMetaMmaTensorOpTileIterator &operator++() {
+    // This is for operand B, so advance on the K dimension
+    lane_position_ += make_Coord(MetaTile::TileShapeB::kRow, 0);
+    return *this;
+  }
+
+  CUTLASS_DEVICE
+  QuantBMetaMmaTensorOpTileIterator &add_tile_offset(
+      TensorCoord const &tile_offset) {
+    int rows = tile_offset.row() * MetaTile::TileShapeB::kRow;
+    int columns = tile_offset.column() *
MetaTile::TileShapeB::kColumn; + lane_position_ += TensorCoord(rows, columns); + return *this; + } + +}; + + +//////////////////////////////////////////////////////////////////////////////// +} // namespace warp +} // namespace gemm +} // namespace cutlass diff --git a/onnxruntime/core/mickey/cutlass_ext/q4gemm/warp/quantb_mma_tensor_op.h b/onnxruntime/core/mickey/cutlass_ext/q4gemm/warp/quantb_mma_tensor_op.h new file mode 100644 index 000000000000..f29cedf326a4 --- /dev/null +++ b/onnxruntime/core/mickey/cutlass_ext/q4gemm/warp/quantb_mma_tensor_op.h @@ -0,0 +1,361 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/** + * Modifications Copyright (c) Microsoft. + * Licensed under the MIT license. + * + * @file quantb_mma_tensor_op.h + * @brief Modified from cutlass/gemm/warp/mma_tensor_op.h + * Templates implementing warp-level matrix multiply-accumulate operations + * targeting tensor cores. 
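+ *
+ *        Modified to dequantize blockwise quantized operand B (4b weights with
+ *        per-block scales and optional zero points) on the fly, before the
+ *        tensor core mma instructions consume it.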
+ */ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/array.h" +#include "cutlass/platform/platform.h" + +#include "cutlass/numeric_conversion.h" +#include "cutlass/numeric_types.h" +#include "cutlass/matrix_shape.h" + +#include "cutlass/arch/memory_sm75.h" +#include "cutlass/arch/mma_sm75.h" +#include "cutlass/arch/mma_sm80.h" + +#include "cutlass/gemm/gemm.h" +#include "cutlass/gemm/warp/mma.h" +#include "cutlass/gemm/warp/mma_tensor_op_policy.h" +#include "cutlass/gemm/warp/mma_tensor_op_tile_iterator_sm80.h" + +#include "cutlass_ext/q4gemm/warp/quantb_meta_mma_tensor_op_tile_iterator.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace gemm { +namespace warp { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Structure to compute the matrix product targeting CUDA cores and SIMT math instructions. +template < + /// Size of the Gemm problem - concept: gemm::GemmShape<> + typename Shape_, + /// Data type of A elements + typename ElementA_, + /// Layout of A matrix (concept: MatrixLayout) + typename LayoutA_, + /// Data type of B elements + typename ElementB_, + /// Layout of B matrix (concept: MatrixLayout) + typename LayoutB_, + /// Data type of quant scales + typename ElementQScale_, + /// Layout of quant scales (concept: MatrixLayout) + typename SmemLayoutQScale_, + /// Data type of quant offsets + typename ElementQOffset_, + /// Layout of quant offsets (concept: MatrixLayout) + typename SmemLayoutQOffset_, + /// Blocking dimensions of quantization + typename QuantBlocking_, + /// Element type of C matrix + typename ElementC_, + /// Layout of C matrix (concept: MatrixLayout) + typename LayoutC_, + /// Policy describing warp-level MmaTensorOp (concept: MmaTensorOp policy) + typename Policy_, + /// Number of partitions along K dimension + int PartitionsK_ = 1, + /// Store the accumulators in row major or column major. Row major is used + /// when output layout is interleaved. 
+ bool AccumulatorsInRowMajor = false, + /// Used for partial specialization + typename Enable = bool +> +class QuantBMmaTensorOp { +public: + /// Shape of warp-level matrix operation (concept: GemmShape) + using Shape = Shape_; + + /// Data type of multiplicand A + using ElementA = ElementA_; + + /// Layout of multiplicand A + using LayoutA = LayoutA_; + + /// Data type of multiplicand B + using ElementB = ElementB_; + + /// Layout of multiplicand B + using LayoutB = LayoutB_; + + /// Data type of accumulator matrix C + using ElementC = ElementC_; + + /// Layout of accumulator matrix C + using LayoutC = LayoutC_; + + /// Shape of the warp in units of thread (concept: MmaLanePolicySimt) + using Policy = Policy_; + + /// Underlying matrix multiply operator (concept: arch::Mma) + using ArchMmaOperator = typename Policy::Operator; + + /// Indicates math operator + using MathOperator = typename ArchMmaOperator::Operator; + + /// Architecture tag from underlying instruction + using ArchTag = typename ArchMmaOperator::ArchTag; + + /// Indicates class of matrix operator + using OperatorClass = arch::OpClassTensorOp; + + /// Shape of underlying instruction + using InstructionShape = typename ArchMmaOperator::Shape; + + /// Complex transform on A operand + static ComplexTransform const kTransformA = ComplexTransform::kNone; + + /// Complex transform on B operand + static ComplexTransform const kTransformB = ComplexTransform::kNone; + + /// Number of threads participating in warp-level matrix product + static int const kThreadCount = 32; + + /// Number of partitions along K dimension + static int const kPartitionsK = PartitionsK_; + +public: + + /// Iterates over the A operand in memory + using IteratorA = MmaTensorOpMultiplicandTileIterator< + MatrixShape, Operand::kA, ElementA, LayoutA, + MatrixShape, + Policy::OpDelta::kRow, kThreadCount, kPartitionsK>; + + /// Storage for A tile + using FragmentA = typename IteratorA::Fragment; + + /// Storage for transformed A tile + using TransformedFragmentA = + Array; + + /// Iterates over the B operand in memory + using IteratorB = MmaTensorOpMultiplicandTileIterator< + MatrixShape, Operand::kB, ElementB, LayoutB, + MatrixShape, + Policy::OpDelta::kRow, kThreadCount, kPartitionsK>; + // warp B MatrixShape<64, 64>, + // layout B cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise<16, 64>, + // instruction op shape cutlass::MatrixShape<16, 8>, + // kPartitionsK 1 + // FragmentB::kElements 32 + + /// Storage for B tile + using FragmentB = typename IteratorB::Fragment; // cutlass::Array + + /// Storage for transformed B tile + /// When loading weights, we packed 4 int4 weights into one 2-byte-element, when expanded + /// we multiply the number of elements by 4. 
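+  /// For example, a FragmentB of 32 16b elements carries 128 packed 4b weights,
+  /// so the transformed fragment holds 128 dequantized values.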
+ /// TODO: make sure ArchMmaOperator::ElementB same as dequantized ElementB + /// and change the transform function below to perform dequantization + using TransformedFragmentB = + Array; + + /// Iterates over the C operand in memory + using IteratorC = MmaTensorOpAccumulatorTileIterator< + MatrixShape, ElementC, LayoutC, + typename ArchMmaOperator::Shape, typename Policy::OpDelta>; + + /// Storage for C tile + using FragmentC = typename IteratorC::Fragment; + + using ElementQScale = ElementQScale_; + using SmemLayoutQScale = SmemLayoutQScale_; + using QuantBlocking = QuantBlocking_; + + using ElementQOffset = ElementQOffset_; + using SmemLayoutQOffset = SmemLayoutQOffset_; + + /// Iterates over the quantization parameters in memory + using WarpQScaleShape = MatrixShape<(Shape::kK / QuantBlocking::kRow), (Shape::kN / QuantBlocking::kColumn)>; + static_assert(Shape::kK % QuantBlocking::kRow == 0, "K must be multiple of QuantBlocking::kRow"); + static_assert(Shape::kN % QuantBlocking::kColumn == 0, "N must be multiple of QuantBlocking::kColumn"); + static_assert(WarpQScaleShape::kCount > 0, "QuantBlocking too big to fit in a warp block!"); + + // TODO This is an expanding iterator, it needs to replicate the quantization parameters + // to all threads in the warp. + using IteratorQMeta = QuantBMetaMmaTensorOpTileIterator< + MatrixShape, QuantBlocking, ElementQScale, SmemLayoutQScale, + ElementQOffset, SmemLayoutQOffset, + ArchMmaOperator, kThreadCount, kPartitionsK>; + + using FragmentQScale = typename IteratorQMeta::FragmentScale; + using FragmentQOffset = typename IteratorQMeta::FragmentOffset; + + /// Number of mma operations performed + using MmaIterations = MatrixShape< + (Shape::kM + ArchMmaOperator::Shape::kM - 1) / ArchMmaOperator::Shape::kM, + (Shape::kN + ArchMmaOperator::Shape::kN - 1) / ArchMmaOperator::Shape::kN + >; + +public: + + /// Underlying matrix multiply operator (concept: arch::Mma) + ArchMmaOperator mma; + +public: + + // + // Methods + // + + /// Ctor + CUTLASS_DEVICE + QuantBMmaTensorOp() {} + + /// Performs a warp-level matrix multiply-accumulate operation + CUTLASS_DEVICE + void operator()( + FragmentC &D, + TransformedFragmentA const &A, + TransformedFragmentB const &B, + FragmentC const &C + ) const { + + using MmaOperandA = typename ArchMmaOperator::FragmentA; + using MmaOperandB = typename ArchMmaOperator::FragmentB; + using MmaOperandC = typename ArchMmaOperator::FragmentC; + + D = C; + + MmaOperandA const *ptr_A = reinterpret_cast(&A); + MmaOperandB const *ptr_B = reinterpret_cast(&B); + MmaOperandC *ptr_D = reinterpret_cast(&D); + + #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 800) + // Serpentine visitation order maximizing reuse of Rb + // The visitation order is like + // _ + // | | | | + // | | | | + // |_| |_| + // + // Down Up Down Up + + CUTLASS_PRAGMA_UNROLL + for (int n = 0; n < MmaIterations::kColumn; ++n) { + + CUTLASS_PRAGMA_UNROLL + for (int m = 0; m < MmaIterations::kRow; ++m) { + + int m_serpentine = ((n % 2) ? 
(MmaIterations::kRow - 1 - m) : m);
+
+        if (AccumulatorsInRowMajor) {  // matrix B is reordered
+          mma(
+              ptr_D[n + m_serpentine * MmaIterations::kColumn],
+              ptr_A[m_serpentine],
+              ptr_B[n],
+              ptr_D[n + m_serpentine * MmaIterations::kColumn]);
+        } else {
+          mma(
+              ptr_D[m_serpentine + n * MmaIterations::kRow],
+              ptr_A[m_serpentine],
+              ptr_B[n],
+              ptr_D[m_serpentine + n * MmaIterations::kRow]);
+        }
+      }
+    }
+  #elif defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
+    // Serpentine visitation order maximizing reuse of Ra
+    // The visitation order is like
+    //      _________
+    //      _________|
+    //     |_________
+    //     __________|
+    //
+    //   Right Left Right Left
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int m = 0; m < MmaIterations::kRow; ++m) {
+
+      CUTLASS_PRAGMA_UNROLL
+      for (int n = 0; n < MmaIterations::kColumn; ++n) {
+
+        int n_serpentine = ((m % 2) ? (MmaIterations::kColumn - 1 - n) : n);
+
+        if (AccumulatorsInRowMajor) {  // matrix B is reordered
+          mma(
+              ptr_D[n_serpentine + m * MmaIterations::kColumn],
+              ptr_A[m],
+              ptr_B[n_serpentine],
+              ptr_D[n_serpentine + m * MmaIterations::kColumn]);
+        } else {
+          mma(ptr_D[m + n_serpentine * MmaIterations::kRow],
+              ptr_A[m],
+              ptr_B[n_serpentine],
+              ptr_D[m + n_serpentine * MmaIterations::kRow]);
+        }
+      }
+    }
+  #else
+    assert(0);
+  #endif
+  }
+
+  /// Transform the mma operands to the required types
+  CUTLASS_DEVICE
+  void transform(TransformedFragmentB &dst_B,
+                 FragmentB const &B,
+                 FragmentQScale const &scales,
+                 FragmentQOffset const &offsets) const {
+
+    Array<uint8_t, FragmentB::kElements * 2> const *ptr_B =
+        reinterpret_cast<Array<uint8_t, FragmentB::kElements * 2> const *>(&B);
+    IteratorQMeta::dequant(scales, offsets, *ptr_B, dst_B);
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+}  // namespace warp
+}  // namespace gemm
+}  // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+//#include "cutlass/gemm/warp/mma_tensor_op_fast_f32.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/onnxruntime/core/mlas/inc/mlas.h b/onnxruntime/core/mlas/inc/mlas.h
index bdd4dba521eb..ce7838556fbf 100644
--- a/onnxruntime/core/mlas/inc/mlas.h
+++ b/onnxruntime/core/mlas/inc/mlas.h
@@ -1614,6 +1614,119 @@ MlasHalfGemmConvertPackB(
     void* PackedB
 );
+#if defined(__aarch64__) && defined(__linux__)
+/**
+ * @brief Whether the current CPU supports Bfloat16 (bf16) acceleration.
+ */
+bool MLASCALL
+MlasBf16AccelerationSupported();
+
+/**
+ * @brief Interface for bf16 gemm post processors.
+ *
+ * Example implementations of this interface include activations,
+ * conversion from single precision to lower precision, etc.
+ *
+ * SBGEMM is computed tile by tile. When a tile of the result matrix
+ * is produced, the method Process() is called to process this tile.
+ * Parameters of this method describe the location and shape of the
+ * tile.
+ */
+class MLAS_SBGEMM_POSTPROCESSOR
+{
+   public:
+    virtual void Process(float*, /**< the address of matrix to process */
+                         size_t, /**< the start row index of matrix */
+                         size_t, /**< the start col index of matrix */
+                         size_t, /**< the element count per row to process */
+                         size_t, /**< the element count per col to process */
+                         size_t  /**< the leading dimension of matrix */
+    ) const = 0;
+
+    virtual ~MLAS_SBGEMM_POSTPROCESSOR() {}
+};
+
+/**
+ * @brief bfloat16 precision activation functions, with optional sum tensor.
+ *        The supplied sum tensor must have the same layout as the GEMM output tensor.
+ *        It is added to the GEMM output before the activation is applied.
+ */
+class MLAS_SBGEMM_ACTIVATION_PROCESSOR : public MLAS_SBGEMM_POSTPROCESSOR
+{
+   public:
+    MLAS_SBGEMM_ACTIVATION_PROCESSOR(const MLAS_ACTIVATION& Activation, const float* SumBuf = nullptr)
+        : Activation_(Activation), SumBuf_(SumBuf)
+    {
+    }
+
+    void Process(float* C, size_t StartM, size_t StartN, size_t CountM, size_t CountN, size_t ldc)
+        const override;
+
+   private:
+    const MLAS_ACTIVATION& Activation_;
+    const float* SumBuf_;
+};
+
+/**
+ * @brief Data parameters for bfloat16 precision GEMM routine
+ *        All except C are [in] parameters
+ */
+struct MLAS_SBGEMM_DATA_PARAMS {
+    const void* A = nullptr;     /**< address of A */
+    const void* B = nullptr;     /**< address of B */
+    const float* Bias = nullptr; /**< address of Bias, vector size N */
+    float* C = nullptr;          /**< address of result matrix */
+    size_t lda = 0;              /**< leading dimension of A */
+    size_t ldb = 0;              /**< leading dimension of B, 0 when B is pre-packed */
+    size_t ldc = 0;              /**< leading dimension of C */
+    const MLAS_SBGEMM_POSTPROCESSOR* OutputProcessor = nullptr;
+    bool AIsfp32 = false;        /**< matrix A is fp32, needs to be converted to bf16 */
+    bool BIsfp32 = false;        /**< matrix B is fp32, needs to be converted to bf16 */
+};
+
+/**
+ * @brief Bfloat16 precision Batched GEMM: C = A * B + Bias
+ *        B can be either fp32 or bf16
+ *
+ * Note: We only support uniform batching, so shapes and types of the
+ * input must be the same across all parameter blocks.
+ *
+ * @param[in]    M          row size of matrix A and C
+ * @param[in]    N          column size of matrix B and C
+ * @param[in]    K          column size of matrix A and row size of matrix B
+ * @param[in]    BatchN     number of batches
+ * @param[inout] DataParams An array (size BatchN) of parameter blocks
+ * @param[in]    ThreadPool
+ * @return
+ */
+void MLASCALL
+MlasSBGemmBatch(const size_t M, const size_t N, const size_t K, const size_t BatchN, const MLAS_SBGEMM_DATA_PARAMS* DataParams, MLAS_THREADPOOL* ThreadPool = nullptr);
+
+/**
+ * @brief For bfloat16 precision GEMM, returns the size of the
+ *        packing buffer needed for the right hand side
+ * @param[in] N  Number of columns
+ * @param[in] K  Number of rows
+ * @return  size of the packing buffer,
+ *          0 if operation not supported
+ */
+size_t MLASCALL
+MlasSBGemmPackBSize(size_t N, size_t K);
+
+/**
+ * @brief For bfloat16 precision GEMM, convert the float matrix B
+ *        to bfloat16 precision and pack it into a packing buffer
+ *
+ * @param[in]  N        Number of columns
+ * @param[in]  K        Number of rows
+ * @param[in]  B        Address of matrix B
+ * @param[in]  ldb      leading dimension of input matrix B
+ * @param[out] PackedB  Address of the packed matrix
+ */
+void MLASCALL
+MlasSBGemmConvertPackB(size_t N, size_t K, const float* B, size_t ldb, void* PackedB);
+#endif
+
 /**
  * @brief Indirect Depthwise convolution for fp16
  * @param Input Supplies the indirect buffer for NHWC input
diff --git a/onnxruntime/core/mlas/inc/mlas_qnbit.h b/onnxruntime/core/mlas/inc/mlas_qnbit.h
index 1e83dd1cec40..32e9cc98106d 100644
--- a/onnxruntime/core/mlas/inc/mlas_qnbit.h
+++ b/onnxruntime/core/mlas/inc/mlas_qnbit.h
@@ -23,19 +23,34 @@ Module Name:
 #include "mlas.h"
 #include "mlas_gemm_postprocessor.h"
+/**
+ * @brief Define compute types of block quantization, in order of decreasing accuracy.
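+ * A given kernel implementation may support only a subset of these compute
+ * types; query availability with MlasIsSQNBitGemmAvailable().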
+ */ +typedef enum { + CompUndef = 0, /*!< undef */ + CompFp32, /*!< input fp32, accumulator fp32 */ + CompFp16, /*!< input fp16, accumulator fp16 */ + CompBf16, /*!< input bf16, accumulator fp32 */ + CompInt8, /*!< input int8, accumulator int32 */ + + // special values that should be the first and last actual values + + CompMostAccurate = CompUndef, + CompLeastAccurate = CompInt8, +} MLAS_SQNBIT_GEMM_COMPUTE_TYPE; + /** * @brief Data parameters for float/n-bit quantized int GEMM routine. */ struct MLAS_SQNBIT_GEMM_DATA_PARAMS { - const float* A = nullptr; ///< address of A (float32 matrix) - size_t lda = 0; ///< leading dimension of A - const void* QuantBData = nullptr; ///< address of quantized B (quantized n-bit int values) - const float* QuantBScale = nullptr; ///< address of scale values of quantized B, one per block - const void* QuantBZeroPoint = nullptr; ///< optional address of zero point values of quantized B, one per block - bool IsBPacked = false; ///< whether B values are packed in an optimized format for the computation - const float* Bias = nullptr; ///< optional address of Bias, vector size N - float* C = nullptr; ///< address of result matrix - size_t ldc = 0; ///< leading dimension of C + const float* A = nullptr; ///< address of A (float32 matrix) + size_t lda = 0; ///< leading dimension of A + const void* QuantBData = nullptr; ///< address of quantized B (quantized n-bit int values) + const float* QuantBScale = nullptr; ///< address of scale values of quantized B, one per block + const void* QuantBZeroPoint = nullptr; ///< optional address of zero point values of quantized B, one per block + const float* Bias = nullptr; ///< optional address of Bias, vector size N + float* C = nullptr; ///< address of result matrix + size_t ldc = 0; ///< leading dimension of C ///< optional post processing to apply to result matrix MLAS_GEMM_POSTPROCESSOR* PostProcessor = nullptr; @@ -46,13 +61,26 @@ struct MLAS_SQNBIT_GEMM_DATA_PARAMS { * A must be a float32 matrix * B must be a quantized and packed n-bit int matrix * + * Call MlasIsSQNBitGemmAvailable() with the same parameters to determine whether this function may be called. + * + * Call MlasSQNBitGemmPackQuantBDataSize() with the same parameters to determine whether + * MLAS_SQNBIT_GEMM_DATA_PARAMS::QuantBData in `DataParams` should point to a buffer packed with + * MlasSQNBitGemmPackQuantBData(). + * + * Call MlasSQNBitGemmBatchWorkspaceSize() with the same parameters to determine whether `Workspace` should + * point to an intermediate workspace buffer. + * * @param[in] M row size of matrix A and C * @param[in] N column size of matrix B and C * @param[in] K column size of matrix A and row size of matrix B * @param[in] BatchN number of batches * @param[in] BlkBitWidth quantized value bit width (e.g., 4 means 4 bit ints) * @param[in] BlkLen number of quantized values per block + * @param[in] ComputeType GEMM compute type (e.g., multiplying float or int8 values) * @param[inout] DataParams An array (size BatchN) of parameter blocks + * @param[in] Workspace Address of intermediate workspace buffer. + If MlasSQNBitGemmBatchWorkspaceSize() returns a non-zero value, this must be a + buffer with at least that many bytes. Otherwise, it may be nullptr. 
* @param[in] ThreadPool optional thread pool to use */ void MLASCALL @@ -63,158 +91,91 @@ MlasSQNBitGemmBatch( size_t BatchN, size_t BlkBitWidth, size_t BlkLen, + MLAS_SQNBIT_GEMM_COMPUTE_TYPE ComputeType, const MLAS_SQNBIT_GEMM_DATA_PARAMS* DataParams, + void* Workspace, MLAS_THREADPOOL* ThreadPool = nullptr ); /** * @brief Determines whether a float32/quantized n-bit int GEMM implementation is available on the current platform. + * * @param[in] BlkBitWidth quantized value bit width (e.g., 4 means 4 bit ints) * @param[in] BlkLen number of quantized values per block + * @param[in] ComputeType GEMM compute type (e.g., multiplying float or int8 values) */ bool MLASCALL MlasIsSQNBitGemmAvailable( size_t BlkBitWidth, - size_t BlkLen + size_t BlkLen, + MLAS_SQNBIT_GEMM_COMPUTE_TYPE ComputeType ); /** - * @brief Define compute types of block quantization - */ -typedef enum { - CompUndef = 0, /*!< undef */ - CompFp32 = 1, /*!< input fp32, accumulator fp32 */ - CompFp16 = 2, /*!< input fp16, accumulator fp16 */ - CompBf16 = 3, /*!< input bf16, accumulator fp32 */ - CompInt8 = 4 /*!< input int8, accumulator int32 */ -} MLAS_SQNBIT_COMPUTE_TYPE; - -/** - * @brief Data parameters for NBits GEMM routine - * C = A * B - * A, C must be a float32 matrix - * B must be a packed nbits blob - * All except C are [in] parameters - */ -struct MLAS_SQNBITS_GEMM_DATA_PACKED_PARAMS { - const float* A = nullptr; /**< address of A (float32 matrix)*/ - const void* B = nullptr; /**< address of B (packed nbits blob)*/ - float* C = nullptr; /**< address of result matrix */ - size_t lda = 0; /**< leading dimension of A */ - size_t ldc = 0; /**< leading dimension of C*/ -}; - -/** - * @brief Compute the byte size of the parameter combination + * @brief Gets the size in bytes of the intermediate workspace buffer required by the float32/quantized n-bit int GEMM + * implementation. If zero, no intermediate workspace is required. * - * @param N the number of columns of matrix B. - * @param K the number of rows of matrix B. - * @param block_size size of the block to quantize, elements from the same block share the same - * scale and zero point - * @param nbits number of bits used for weight quantization - * @param is_asym flag for asymmetric quantization - * @param comp_type specify input data type and accumulator data type - * @return size of the packing buffer, 0 if the operation is not yet supported. + * @param[in] M row size of matrix A and C + * @param[in] N column size of matrix B and C + * @param[in] K column size of matrix A and row size of matrix B + * @param[in] BatchN number of batches + * @param[in] BlkBitWidth quantized value bit width (e.g., 4 means 4 bit ints) + * @param[in] BlkLen number of quantized values per block + * @param[in] ComputeType GEMM compute type (e.g., multiplying float or int8 values) */ size_t MLASCALL -MlasNBitsGemmPackBSize( - size_t N, size_t K, size_t block_size, int nbits, bool is_asym, MLAS_SQNBIT_COMPUTE_TYPE comp_type -); - -/** - * @brief Prepack tensor data from n-bit quantized data, scale and zero point buffers. - * - * @param PackedBuf packed data buffer - * @param QData quantized data buffer - * @param Scale scale pointer - * @param Zp zero point pointer - * @param N the number of columns of matrix B. - * @param K the number of rows of matrix B. 
- * @param ldb leading dimension of B - * @param block_size size of the block to quantize, elements from the same block share the same - * scale and zero point - * @param nbits number of bits used for weight quantization (default 4) - * @param is_asym flag for asymmetric quantization - * @param comp_type specify input data type and accumulator data type - * @param last_call flag to activate the epilogue process of packB. OpKernel::PrePack will query input tensor - * one by one: QData, Scale, Zp (if is_asym is true). But kernel prefers to pack all tensors into one blob data where - * they can share the common attributes like: block_size. Meanwhile, kernel has some pre-computations to speed up - * inference which require that all blob data are ready. So, you need to set this flag to true when passing Scale - * (is_asym is false) and Zp(is_asym is true). - * @param thread_pool - */ -void MLASCALL -MlasNBitsGemmPackB( - void* PackedBuf, - const uint8_t* QData, - const float* Scale, - const uint8_t* Zp, +MlasSQNBitGemmBatchWorkspaceSize( + size_t M, size_t N, size_t K, - size_t ldb, - size_t block_size, - int nbits, - bool is_asym, - bool last_call, - MLAS_SQNBIT_COMPUTE_TYPE comp_type, - MLAS_THREADPOOL* thread_pool -); - -/** - * @brief Unpack and dequantize to fp32 - * - * @param FpData unpacked float32 data - * @param PackedBuf quantized and packed data - * @param N the number of columns of matrix B. - * @param K the number of rows of matrix B. - * @param ldb leading dimension of B - * @param thread_pool - */ -void MLASCALL -MlasNBitsGemmUnPackB( - float* FpData, const void* PackedBuf, size_t N, size_t K, size_t ldb, MLAS_THREADPOOL* thread_pool + size_t BatchN, + size_t BlkBitWidth, + size_t BlkLen, + MLAS_SQNBIT_GEMM_COMPUTE_TYPE ComputeType ); /** - * @brief Get the workspace size required by computation. + * @brief Gets the size in bytes of the packed quantized B data. + * If non-zero, the quantized B data must first be packed by calling MlasSQNBitGemmPackQuantBData() with a buffer of + * this size, and then that packed quantized B data buffer must be passed to MlasSQNBitGemmBatch(). + * If zero, MlasSQNBitGemmPackQuantBData() must not be called and the quantized B data must be directly passed to + * MlasSQNBitGemmBatch(). * - * @param[in] M row size of matrix A and C - * @param[in] N column size of matrix B and C - * @param[in] K column size of matrix A and row size of matrix B - * @param[in] BatchN number of batches - * @param[inout] DataParams An array (size BatchN) of parameter blocks - * @return Workspace size in bytes + * @param[in] N column size of matrix B and C + * @param[in] K column size of matrix A and row size of matrix B + * @param[in] BlkBitWidth quantized value bit width (e.g., 4 means 4 bit ints) + * @param[in] BlkLen number of quantized values per block + * @param[in] ComputeType GEMM compute type (e.g., multiplying float or int8 values) */ size_t MLASCALL -MlasSQNBitsGemmBatchWorkspaceSize( - const size_t M, - const size_t N, - const size_t K, - const size_t BatchN, - const MLAS_SQNBITS_GEMM_DATA_PACKED_PARAMS* DataParams +MlasSQNBitGemmPackQuantBDataSize( + size_t N, + size_t K, + size_t BlkBitWidth, + size_t BlkLen, + MLAS_SQNBIT_GEMM_COMPUTE_TYPE ComputeType ); /** - * @brief Batched GEMM: C = A * B - * A, C must be a float32 matrix - * B must be a packed nbits blob + * @brief Packs the quantized B data in a format that the kernel expects. 
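+ * The packed layout is internal to the implementation and may vary with the
+ * given parameters.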
* - * @param[in] M row size of matrix A and C - * @param[in] N column size of matrix B and C - * @param[in] K column size of matrix A and row size of matrix B - * @param[in] BatchN number of batches - * @param[inout] DataParams An array (size BatchN) of parameter blocks - * @param[in] WorkSpace temporary buffer - * @param[in] ThreadPool - * @return + * @param[in] N column size of matrix B and C + * @param[in] K column size of matrix A and row size of matrix B + * @param[in] BlkBitWidth quantized value bit width (e.g., 4 means 4 bit ints) + * @param[in] BlkLen number of quantized values per block + * @param[in] ComputeType GEMM compute type (e.g., multiplying float or int8 values) + * @param[in] QuantBData quantized B data + * @param[out] PackedQuantBData packed quantized B data + * @param[in] ThreadPool optional thread pool to use */ void MLASCALL -MlasSQNBitsGemmBatchPackedB( - const size_t M, - const size_t N, - const size_t K, - const size_t BatchN, - const MLAS_SQNBITS_GEMM_DATA_PACKED_PARAMS* DataParams, - void* WorkSpace, +MlasSQNBitGemmPackQuantBData( + size_t N, + size_t K, + size_t BlkBitWidth, + size_t BlkLen, + MLAS_SQNBIT_GEMM_COMPUTE_TYPE ComputeType, + const void* QuantBData, + void* PackedQuantBData, MLAS_THREADPOOL* ThreadPool = nullptr ); diff --git a/onnxruntime/core/mlas/lib/aarch64/SbgemmKernelNeon.S b/onnxruntime/core/mlas/lib/aarch64/SbgemmKernelNeon.S new file mode 100644 index 000000000000..e424c30515e9 --- /dev/null +++ b/onnxruntime/core/mlas/lib/aarch64/SbgemmKernelNeon.S @@ -0,0 +1,907 @@ +/*++ + +Copyright (c) Microsoft Corporation. All rights reserved. +Copyright 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved. + +Licensed under the MIT License. + +Module Name: + + SbgemmKernelNeon.s + +Abstract: + + This module implements the kernels for the bfloat16 half precision matrix/matrix + multiply operation (SBGEMM). + +--*/ + +#include "asmmacro.h" + + .text + +// +// Stack frame layout for the sbgemm kernel. d8-d15, x19-x30 need save +// + .equ .LMlasSbgemmKernel_backup_x19_x20, 0 + .equ .LMlasSbgemmKernel_backup_x21_x22, 16 + .equ .LMlasSbgemmKernel_backup_x23_x24, 32 + .equ .LMlasSbgemmKernel_backup_x25_x26, 48 + .equ .LMlasSbgemmKernel_backup_x27_x28, 64 + .equ .LMlasSbgemmKernel_backup_d8_d9, 80 + .equ .LMlasSbgemmKernel_backup_d10_d11, 96 + .equ .LMlasSbgemmKernel_backup_d12_d13, 112 + .equ .LMlasSbgemmKernel_backup_d14_d15, 128 + .equ .LMlasSbgemmKernel_SavedRegisters, 144 + .equ .LMlasSbgemmKernel_SavedRegisters_Neg, -144 + + +// +// ClearRowAccumulators +// +// Generates the code to clear the accumulators for a single row of the output +// block. +// + + .macro InitRowAccumulators Columns, Vec1Reg, Vec2Reg, Vec3Reg, Vec4Reg + + mov v\Vec1Reg\().16b,v0.16b +.if \Columns\() > 2 + mov v\Vec2Reg\().16b,v1.16b +.endif +.if \Columns\() > 4 + mov v\Vec3Reg\().16b,v2.16b +.endif +.if \Columns\() > 6 + mov v\Vec4Reg\().16b,v3.16b +.endif + + .endm + +// +// InitBlockAccumulators +// +// Generates the code to init the accumulators for a single row of the output +// block. 
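+// The accumulators start from the broadcast Bias values when a bias pointer is
+// supplied in x8, and from zero otherwise.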
+// + + .macro InitBlockAccumulators Mode, Columns, Rows + + //check if the Bias != nullptr + cbz x8,.L\Mode\().InitBlock\Columns\().x\Rows\().SkipBiasAdd + + ld1 {v14.4s},[x8],#16 // load Bias[0] + // v4~v7 will be set to matrixB after this, so, they can used now + dup v4.4s,v14.s[0] // broadcast Bias + dup v5.4s,v14.s[1] + dup v6.4s,v14.s[2] + dup v7.4s,v14.s[3] + + zip1 v0.4s, v4.4s, v5.4s + zip2 v1.4s, v6.4s, v7.4s +.if \Columns\() > 4 + ld1 {v15.4s},[x8],#16 // load Bias[4] + dup v4.4s,v15.s[0] // broadcast Bias + dup v5.4s,v15.s[1] + dup v6.4s,v15.s[2] + dup v7.4s,v15.s[3] + + zip1 v2.4s, v4.4s, v5.4s + zip2 v3.4s, v6.4s, v7.4s +.endif + + b .L\Mode\().PopulateAccumulators\Columns\().x\Rows\() + +.L\Mode\().InitBlock\Columns\().x\Rows\().SkipBiasAdd: + eor v0.16b,v0.16b,v0.16b // No bias, reset regs + eor v1.16b,v1.16b,v1.16b + eor v2.16b,v2.16b,v2.16b + eor v3.16b,v3.16b,v3.16b + +.L\Mode\().PopulateAccumulators\Columns\().x\Rows\(): + InitRowAccumulators \Columns\(),16,17,18,19 +.if \Rows\() > 2 + InitRowAccumulators \Columns\(),20,21,22,23 +.endif +.if \Rows\() > 4 + InitRowAccumulators \Columns\(),24,25,26,27 +.endif +.if \Rows\() > 6 + InitRowAccumulators \Columns\(),28,29,30,31 +.endif + + .endm + +// LoadMatrixAElementsBy8 +// +// Generates the code to load 4 or 8 elements from matrix A. +// + .macro LoadMatrixAElementsBy8 Rows + + ldr q8,[x0],#16 + bfcvtn v8.4h, v8.4s +.if \Rows\() > 1 + ldr q1,[x10],#16 + bfcvtn2 v8.8h, v1.4s +.endif + +.if \Rows\() > 2 + ldr q9,[x11],#16 + bfcvtn v9.4h, v9.4s +.endif +.if \Rows\() > 3 + ldr q1,[x12],#16 + bfcvtn2 v9.8h, v1.4s +.endif + +.if \Rows\() > 4 + ldr q10,[x20],#16 + bfcvtn v10.4h, v10.4s +.endif +.if \Rows\() > 5 + ldr q1,[x21],#16 + bfcvtn2 v10.8h, v1.4s +.endif + +.if \Rows\() > 6 + ldr q11,[x22],#16 + bfcvtn v11.4h, v11.4s +.endif +.if \Rows\() > 7 + ldr q1,[x23],#16 + bfcvtn2 v11.8h, v1.4s +.endif + + .endm + + +// +// MultiplyAccumulateRow +// +// Generates the code to multiply and accumulate a single row of the output +// block. +// + + .macro MultiplyAccumulateRow Columns, MatrixAReg, Vec1Reg, Vec2Reg, Vec3Reg, Vec4Reg + + bfmmla v\Vec1Reg\().4s, \MatrixAReg\().8h, v4.8h +.if \Columns\() > 2 + bfmmla v\Vec2Reg\().4s, \MatrixAReg\().8h, v5.8h +.endif +.if \Columns\() > 4 + bfmmla v\Vec3Reg\().4s, \MatrixAReg\().8h, v6.8h +.endif +.if \Columns\() > 6 + bfmmla v\Vec4Reg\().4s, \MatrixAReg\().8h, v7.8h +.endif + + .endm + +// +// MultiplyAccumulateBlock +// +// Generates the code to multiply and accumulate into the output block. +// + + .macro MultiplyAccumulateBlock Columns, Rows + + MultiplyAccumulateRow \Columns\(),v8,16,17,18,19 +.if \Rows\() > 2 + MultiplyAccumulateRow \Columns\(),v9,20,21,22,23 +.endif +.if \Rows\() > 4 + MultiplyAccumulateRow \Columns\(),v10,24,25,26,27 +.endif +.if \Rows\() > 6 + MultiplyAccumulateRow \Columns\(),v11,28,29,30,31 +.endif + + .endm + +// +// ComputeBlockLoop +// +// Generates the code to loop over K entries of the input matrices to produce +// the output block. 
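+// Each iteration of the main loop consumes 4 K entries: rows of matrix A are
+// converted from fp32 to bf16 with bfcvtn/bfcvtn2, then multiplied against the
+// packed B panel with bfmmla.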
+// + + .macro ComputeBlockLoop Mode, Columns, Rows + + InitBlockAccumulators \Mode\(),\Columns\(),\Rows\() + + add x10,x0,x6,lsl #2 // compute matrix A plus 1 row +.if \Rows\() > 2 + add x11,x10,x6,lsl #2 // compute matrix A plus 2 rows + add x12,x11,x6,lsl #2 // compute matrix A plus 3 rows +.endif +.if \Rows\() > 4 + add x20,x12,x6,lsl #2 // compute matrix A plus 4 rows + add x21,x20,x6,lsl #2 // compute matrix A plus 5 rows +.endif +.if \Rows\() > 6 + add x22,x21,x6,lsl #2 // compute matrix A plus 6 rows + add x23,x22,x6,lsl #2 // compute matrix A plus 7 rows +.endif + sub x9,x3,#4 // block count to process + tbnz x9,#63,.L\Mode\().ProcessRemaining\Columns\().x\Rows\().Blocks + +.L\Mode\().Compute\Columns\().x\Rows\().BlockBy4Loop: + + LoadMatrixAElementsBy8 \Rows\() + ldr q4, [x1],#16 +.if \Columns\() > 2 + ldr q5,[x1],#16 +.endif +.if \Columns\() > 4 + ldr q6,[x1],#16 +.endif +.if \Columns\() > 6 + ldr q7,[x1],#16 +.endif + MultiplyAccumulateBlock \Columns\(),\Rows\() + + sub x9,x9,#4 + tbz x9,#63,.L\Mode\().Compute\Columns\().x\Rows\().BlockBy4Loop +.L\Mode\().ProcessRemaining\Columns\().x\Rows\().Blocks: + add x9,x9,#4 // correct for over-subtract above + cbz x9,.L\Mode\().Output\Columns\().x\Rows\().Block + +.L\Mode\().Compute\Columns\().x\Rows\().BlockBy4PaddedLoop: + LoadMatrixAElementsBy8 \Rows\() + ldr q4, [x1],#16 +.if \Columns\() > 2 + ldr q5,[x1],#16 +.endif +.if \Columns\() > 4 + ldr q6,[x1],#16 +.endif +.if \Columns\() > 6 + ldr q7,[x1],#16 +.endif + MultiplyAccumulateBlock \Columns\(),\Rows\() + +.L\Mode\().Output\Columns\().x\Rows\().Block: + + .endm + + +// +// OutputRow2Element +// OutputRow4Element +// OutputRow6Element +// OutputRow8Element +// OutputRow10Element +// OutputRow12Element +// OutputRow14Element +// OutputRow16Element +// +// Generates the code to store elements to the output block. 
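+// In "Add" mode the existing C values are loaded and accumulated into the
+// results before storing; otherwise the results overwrite C.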
+// + + .macro OutputRow2Element Mode, AddrReg1, AddrReg2, Vec1Reg, Vec2Reg, Vec3Reg, Vec4Reg, last_row + +.ifeqs "\Mode\()","Add" + ldr s8,[\AddrReg1\()],#0 +.if \last_row\() == 0 + ldr s9,[\AddrReg2\()],#0 +.else + mov x27,#0 + mov v9.D[0],x27 + mov v9.D[1],x27 +.endif + mov v8.S[2], v9.S[0] + + fadd v8.4s,v8.4s,v\Vec1Reg\().4s + + mov w27, v8.S[0] + str w27, [\AddrReg1\()],#4 + +.if \last_row\() == 0 + mov w27, v8.S[2] + str w27, [\AddrReg2\()],#4 +.endif + +.else + mov w27, v\Vec1Reg\().S[0] + str w27, [\AddrReg1\()],#4 + +.if \last_row\() == 0 + mov w27, v\Vec1Reg\().S[2] + str w27, [\AddrReg2\()],#4 +.endif + +.endif + + .endm + + + .macro OutputRow4Element Mode, AddrReg1, AddrReg2, Vec1Reg, Vec2Reg, Vec3Reg, Vec4Reg, last_row + +.ifeqs "\Mode\()","Add" + ldr d8,[\AddrReg1\()],#0 +.if \last_row\() == 0 + ldr d9,[\AddrReg2\()],#0 +.else + mov x27,#0 + mov v9.D[0],x27 + mov v9.D[1],x27 +.endif + + mov v8.D[1], v9.D[0] + + fadd v8.4s,v8.4s,v\Vec1Reg\().4s + + mov x27, v8.D[0] + mov x28, v8.D[1] + + str x27, [\AddrReg1\()],#8 +.if \last_row\() == 0 + str x28, [\AddrReg2\()],#8 +.endif + +.else + mov x27, v\Vec1Reg\().D[0] + mov x28, v\Vec1Reg\().D[1] + + str x27, [\AddrReg1\()],#8 +.if \last_row\() == 0 + str x28, [\AddrReg2\()],#8 +.endif + +.endif + + .endm + + + .macro OutputRow6Element Mode, AddrReg1, AddrReg2, Vec1Reg, Vec2Reg, Vec3Reg, Vec4Reg, last_row + +.ifeqs "\Mode\()","Add" + ldr d8,[\AddrReg1\()],#8 + ldr w28,[\AddrReg1\()],#-8 + mov v8.S[2], w28 +.if \last_row\() == 0 + ldr d9,[\AddrReg2\()],#8 + ldr w27,[\AddrReg2\()],#-8 + mov v9.S[2], w27 +.else + mov x27,#0 + mov v9.D[0],x27 + mov v9.D[1],x27 +.endif + uzp1 v4.2d,v\Vec1Reg\().2d,v\Vec2Reg\().2d + uzp2 v5.2d,v\Vec1Reg\().2d,v\Vec2Reg\().2d + + fadd v8.4s,v8.4s,v4.4s + fadd v9.4s,v9.4s,v5.4s + + mov x27, v8.D[0] + str x27, [\AddrReg1\()],#8 + mov w27, v8.S[2] + str w27, [\AddrReg1\()],#4 + +.if \last_row\() == 0 + mov x27, v9.D[0] + str x27, [\AddrReg2\()],#8 + mov w27, v9.S[2] + str w27, [\AddrReg2\()],#4 +.endif + +.else + uzp1 v4.2d, v\Vec1Reg\().2d,v\Vec2Reg\().2d + uzp2 v5.2d, v\Vec1Reg\().2d,v\Vec2Reg\().2d + + mov x27, v4.D[0] + str x27, [\AddrReg1\()],#8 + mov w27, v4.S[2] + str w27, [\AddrReg1\()],#4 + +.if \last_row\() == 0 + mov x27, v5.D[0] + str x27, [\AddrReg2\()],#8 + mov w27, v5.S[2] + str w27, [\AddrReg2\()],#4 +.endif + +.endif + + .endm + + + .macro OutputRow8Element Mode, AddrReg1, AddrReg2, Vec1Reg, Vec2Reg, Vec3Reg, Vec4Reg, last_row + +.ifeqs "\Mode\()","Add" + ldr q8,[\AddrReg1\()],#0 +.if \last_row\() == 0 + ldr q9,[\AddrReg2\()],#0 +.else + mov x27,#0 + mov v9.D[0],x27 + mov v9.D[1],x27 +.endif + uzp1 v4.2d,v\Vec1Reg\().2d,v\Vec2Reg\().2d + uzp2 v5.2d,v\Vec1Reg\().2d,v\Vec2Reg\().2d + + fadd v8.4s,v8.4s,v4.4s + fadd v9.4s,v9.4s,v5.4s + + str q8,[\AddrReg1\()],#16 +.if \last_row\() == 0 + str q9,[\AddrReg2\()],#16 +.endif + +.else + uzp1 v4.2d, v\Vec1Reg\().2d,v\Vec2Reg\().2d + uzp2 v5.2d, v\Vec1Reg\().2d,v\Vec2Reg\().2d + + str q4,[\AddrReg1\()],#16 +.if \last_row\() == 0 + str q5,[\AddrReg2\()],#16 +.endif + +.endif + + .endm + + + .macro OutputRow10Element Mode, AddrReg1, AddrReg2, Vec1Reg, Vec2Reg, Vec3Reg, Vec4Reg, last_row + +.ifeqs "\Mode\()","Add" + ldr q8,[\AddrReg1\()],#16 + ldr w28, [\AddrReg1\()],#-16 + +.if \last_row\() == 0 + ldr q9,[\AddrReg2\()],#16 + ldr w27,[\AddrReg2\()],#-16 +.else + mov x27,#0 + mov v9.D[0],x27 + mov v9.D[1],x27 +.endif + uzp1 v4.2d,v\Vec1Reg\().2d,v\Vec2Reg\().2d + uzp2 v5.2d,v\Vec1Reg\().2d,v\Vec2Reg\().2d + + fadd v8.4s,v8.4s,v4.4s + fadd v9.4s,v9.4s,v5.4s 
+ + str q8,[\AddrReg1\()],#16 +.if \last_row\() == 0 + str q9,[\AddrReg2\()],#16 +.endif + mov v8.S[0], w28 + mov v8.S[2], w27 + + fadd v8.4s,v8.4s,v\Vec3Reg\().4s + + mov w27, v8.S[0] + mov w28, v8.S[2] + + str w27, [\AddrReg1\()],#4 +.if \last_row\() == 0 + str w28, [\AddrReg2\()],#4 +.endif + +.else + uzp1 v4.2d, v\Vec1Reg\().2d,v\Vec2Reg\().2d + uzp2 v5.2d, v\Vec1Reg\().2d,v\Vec2Reg\().2d + + str q4,[\AddrReg1\()],#16 +.if \last_row\() == 0 + str q5,[\AddrReg2\()],#16 +.endif + mov w27, v\Vec3Reg\().S[0] + mov w28, v\Vec3Reg\().S[2] + + str w27, [\AddrReg1\()],#4 +.if \last_row\() == 0 + str w28, [\AddrReg2\()],#4 +.endif +.endif + +.endm + + + .macro OutputRow12Element Mode, AddrReg1, AddrReg2, Vec1Reg, Vec2Reg, Vec3Reg, Vec4Reg, last_row + +.ifeqs "\Mode\()","Add" + ldr q8,[\AddrReg1\()],#16 + ldr d10,[\AddrReg1\()],#-16 +.if \last_row\() == 0 + ldr q9,[\AddrReg2\()],#16 + ldr d11,[\AddrReg2\()],#-16 +.else + mov x27,#0 + mov v9.D[0],x27 + mov v9.D[1],x27 + mov v11.D[0],x27 +.endif + uzp1 v4.2d,v\Vec1Reg\().2d,v\Vec2Reg\().2d + uzp2 v5.2d,v\Vec1Reg\().2d,v\Vec2Reg\().2d + + fadd v8.4s,v8.4s,v4.4s + fadd v9.4s,v9.4s,v5.4s + + str q8,[\AddrReg1\()],#16 +.if \last_row\() == 0 + str q9,[\AddrReg2\()],#16 +.endif + + mov v10.D[1], v11.D[0] + + fadd v10.4s,v10.4s,v\Vec3Reg\().4s + + mov x27, v10.D[0] + mov x28, v10.D[1] + + str x27, [\AddrReg1\()],#8 +.if \last_row\() == 0 + str x28, [\AddrReg2\()],#8 +.endif + +.else + uzp1 v4.2d, v\Vec1Reg\().2d,v\Vec2Reg\().2d + uzp2 v5.2d, v\Vec1Reg\().2d,v\Vec2Reg\().2d + + str q4,[\AddrReg1\()],#16 +.if \last_row\() == 0 + str q5,[\AddrReg2\()],#16 +.endif + mov x27, v\Vec3Reg\().D[0] + mov x28, v\Vec3Reg\().D[1] + + str x27, [\AddrReg1\()],#8 +.if \last_row\() == 0 + str x28, [\AddrReg2\()],#8 +.endif +.endif + + .endm + + .macro OutputRow14Element Mode, AddrReg1, AddrReg2, Vec1Reg, Vec2Reg, Vec3Reg, Vec4Reg, last_row + +.ifeqs "\Mode\()","Add" + ldr q8,[\AddrReg1\()],#16 + ldr d10,[\AddrReg1\()],#8 + ldr w28, [\AddrReg1\()],#-24 + mov v10.S[2], w28 +.if \last_row\() == 0 + ldr q9,[\AddrReg2\()],#16 + ldr d11,[\AddrReg2\()],#8 + ldr w27,[\AddrReg2\()],#-24 + mov v11.S[2], w27 +.else + mov x27,#0 + mov v9.D[0],x27 + mov v9.D[1],x27 + + mov v11.D[0],x27 + mov v11.D[1],x27 +.endif + uzp1 v4.2d,v\Vec1Reg\().2d,v\Vec2Reg\().2d + uzp2 v5.2d,v\Vec1Reg\().2d,v\Vec2Reg\().2d + + uzp1 v6.2d, v\Vec3Reg\().2d,v\Vec4Reg\().2d + uzp2 v7.2d, v\Vec3Reg\().2d,v\Vec4Reg\().2d + + fadd v8.4s,v8.4s,v4.4s + fadd v9.4s,v9.4s,v5.4s + fadd v10.4s,v10.4s,v6.4s + fadd v11.4s,v11.4s,v7.4s + + str q8,[\AddrReg1\()],#16 + + mov x27, v10.D[0] + str x27, [\AddrReg1\()],#8 + mov w27, v10.S[2] + str w27, [\AddrReg1\()],#4 + +.if \last_row\() == 0 + str q9,[\AddrReg2\()],#16 + mov x27, v11.D[0] + str x27, [\AddrReg2\()],#8 + mov w27, v11.S[2] + str w27, [\AddrReg2\()],#4 +.endif + +.else + uzp1 v4.2d, v\Vec1Reg\().2d,v\Vec2Reg\().2d + uzp2 v5.2d, v\Vec1Reg\().2d,v\Vec2Reg\().2d + uzp1 v6.2d, v\Vec3Reg\().2d,v\Vec4Reg\().2d + uzp2 v7.2d, v\Vec3Reg\().2d,v\Vec4Reg\().2d + + str q4,[\AddrReg1\()],#16 + mov x27, v6.D[0] + str x27, [\AddrReg1\()],#8 + mov w27, v6.S[2] + str w27, [\AddrReg1\()],#4 + +.if \last_row\() == 0 + str q5,[\AddrReg2\()],#16 + mov x27, v7.D[0] + str x27, [\AddrReg2\()],#8 + mov w27, v7.S[2] + str w27, [\AddrReg2\()],#4 +.endif +.endif + + .endm + + + .macro OutputRow16Element Mode, AddrReg1, AddrReg2, Vec1Reg, Vec2Reg, Vec3Reg, Vec4Reg, last_row + +.ifeqs "\Mode\()","Add" + ldp q8,q10,[\AddrReg1\()],#0 +.if \last_row\() == 0 + ldp q9,q11,[\AddrReg2\()],#0 +.else + 
mov x27,#0 + mov v9.D[0],x27 + mov v9.D[1],x27 + + mov v11.D[0],x27 + mov v11.D[1],x27 +.endif + uzp1 v4.2d,v\Vec1Reg\().2d,v\Vec2Reg\().2d + uzp2 v5.2d,v\Vec1Reg\().2d,v\Vec2Reg\().2d + + uzp1 v6.2d, v\Vec3Reg\().2d,v\Vec4Reg\().2d + uzp2 v7.2d, v\Vec3Reg\().2d,v\Vec4Reg\().2d + + fadd v8.4s,v8.4s,v4.4s + fadd v9.4s,v9.4s,v5.4s + fadd v10.4s,v10.4s,v6.4s + fadd v11.4s,v11.4s,v7.4s + + stp q8,q10,[\AddrReg1\()],#32 +.if \last_row\() == 0 + stp q9,q11,[\AddrReg2\()],#32 +.endif +.else + uzp1 v4.2d, v\Vec1Reg\().2d,v\Vec2Reg\().2d + uzp2 v5.2d, v\Vec1Reg\().2d,v\Vec2Reg\().2d + uzp1 v6.2d, v\Vec3Reg\().2d,v\Vec4Reg\().2d + uzp2 v7.2d, v\Vec3Reg\().2d,v\Vec4Reg\().2d + + stp q4,q6,[\AddrReg1\()],#32 +.if \last_row\() == 0 + stp q5,q7,[\AddrReg2\()],#32 +.endif +.endif + + .endm + +// +// OutputBlock +// +// Generates the code to store the output block. +// + + .macro OutputBlock Mode, Columns, Rows + + OutputRow\Columns\()Element \Mode\(),x2,x13,16,17,18,19,(\Rows\() == 1) + +.if \Rows\() > 2 + OutputRow\Columns\()Element \Mode\(),x14,x15,20,21,22,23,(\Rows\() == 3) +.endif + +.if \Rows\() > 4 + OutputRow\Columns\()Element \Mode\(),x16,x17,24,25,26,27,(\Rows\() == 5) +.endif + +.if \Rows\() > 6 + OutputRow\Columns\()Element \Mode\(),x18,x19,28,29,30,31,(\Rows\() == 7) +.endif + + .endm +// +// ProcessRows +// +// Generates the code to compute and store the output block for a +// fixed number of rows. +// + + .macro ProcessRows Mode, Rows + mov x4,#\Rows\() // return number of rows handled + cmp x5,#6 + ble .L\Mode\().ProcessNextColumnLoop6x\Rows\() + +.L\Mode\().ProcessNextColumnLoop8x\Rows\(): + ComputeBlockLoop \Mode\(),8,\Rows\() + + sub x5,x5,#8 + cmp x5,#0 + blt .L\Mode\().Output14ElementsOnlyFor\Rows\() + OutputBlock \Mode\(),16,\Rows\() + mov x0,x26 // reload matrix A + cmp x5,#6 + bgt .L\Mode\().ProcessNextColumnLoop8x\Rows\() + cbz x5,.L\Mode\().ExitKernel + + +.L\Mode\().ProcessNextColumnLoop6x\Rows\(): + + cmp x5,#4 + ble .L\Mode\().ProcessNextColumnLoop4x\Rows\() + ComputeBlockLoop \Mode\(),6,\Rows\() + sub x5,x5,#6 + cmp x5,#0 + blt .L\Mode\().Output10ElementsOnlyFor\Rows\() + OutputBlock \Mode\(),12,\Rows\() + + mov x0,x26 // reload matrix A + cmp x5,#4 + bgt .L\Mode\().ProcessNextColumnLoop6x\Rows\() + b .L\Mode\().ExitKernel + +.L\Mode\().ProcessNextColumnLoop4x\Rows\(): + cmp x5,#2 + ble .L\Mode\().ProcessNextColumnLoop2x\Rows\() + ComputeBlockLoop \Mode\(),4,\Rows\() + sub x5,x5,#4 + cmp x5,#0 + blt .L\Mode\().Output6ElementsOnlyFor\Rows\() + + OutputBlock \Mode\(),8,\Rows\() + + mov x0,x26 // reload matrix A + cmp x5,#2 + bgt .L\Mode\().ProcessNextColumnLoop4x\Rows\() + b .L\Mode\().ExitKernel + +.L\Mode\().ProcessNextColumnLoop2x\Rows\(): + ComputeBlockLoop \Mode\(),2,\Rows\() + sub x5,x5,#2 + cmp x5,#0 + blt .L\Mode\().Output2ElementsOnlyFor\Rows\() + + OutputBlock \Mode\(),4,\Rows\() + + mov x0,x26 // reload matrix A + cmp x5,#2 + b .L\Mode\().ExitKernel + +.L\Mode\().Output14ElementsOnlyFor\Rows\(): + OutputBlock \Mode\(),14,\Rows\() + b .L\Mode\().ExitKernel + + +.L\Mode\().Output10ElementsOnlyFor\Rows\(): + OutputBlock \Mode\(),10,\Rows\() + b .L\Mode\().ExitKernel + + +.L\Mode\().Output6ElementsOnlyFor\Rows\(): + OutputBlock \Mode\(),6,\Rows\() + b .L\Mode\().ExitKernel + + +.L\Mode\().Output2ElementsOnlyFor\Rows\(): + OutputBlock \Mode\(),2,\Rows\() + b .L\Mode\().ExitKernel + + .endm + + +/*++ + +Routine Description: + + This routine is an inner kernel to compute matrix multiplication for a + set of rows.
+ +Arguments: + + A (x0) - Supplies the address of matrix A. + + B (x1) - Supplies the address of matrix B. The matrix data has been packed + using MlasSbgemmCopyPackB or MlasSbgemmTransposePackB. + + C (x2) - Supplies the address of matrix C. + + CountK (x3) - Supplies the number of columns from matrix A and the number + of rows from matrix B to iterate over. + + CountM (x4) - Supplies the maximum number of rows that can be processed for + matrix A and matrix C. The actual number of rows handled for this + invocation depends on the kernel implementation. + + CountN (x5) - Supplies the number of columns from matrix B and matrix C to + iterate over. + + lda (x6) - Supplies the first dimension of matrix A. + + ldc (x7) - Supplies the first dimension of matrix C. + + Bias - Supplies the address of the Bias vector [1xN] + + +Return Value: + + Returns the number of rows handled. + +--*/ + .macro SbgemmKernelNeonFunction Mode + + FUNCTION_ENTRY MlasSbgemmKernel\Mode\() + + ldr x8, [sp, #0] // Bias vector + + stp x19, x20, [sp, #.LMlasSbgemmKernel_SavedRegisters_Neg]! + stp x21, x22, [sp, #.LMlasSbgemmKernel_backup_x21_x22] + stp x23, x24, [sp, #.LMlasSbgemmKernel_backup_x23_x24] + stp x25, x26, [sp, #.LMlasSbgemmKernel_backup_x25_x26] + stp x27, x28, [sp, #.LMlasSbgemmKernel_backup_x27_x28] + stp d8, d9, [sp, #.LMlasSbgemmKernel_backup_d8_d9] + stp d10, d11, [sp, #.LMlasSbgemmKernel_backup_d10_d11] + stp d12, d13, [sp, #.LMlasSbgemmKernel_backup_d12_d13] + stp d14, d15, [sp, #.LMlasSbgemmKernel_backup_d14_d15] + + add x13,x2,x7,lsl #2 // compute matrix C plus 1 row + add x14,x13,x7,lsl #2 // compute matrix C plus 2 rows + add x15,x14,x7,lsl #2 // compute matrix C plus 3 rows + add x16,x15,x7,lsl #2 // compute matrix C plus 4 rows + add x17,x16,x7,lsl #2 // compute matrix C plus 5 rows + add x18,x17,x7,lsl #2 // compute matrix C plus 6 rows + add x19,x18,x7,lsl #2 // compute matrix C plus 7 rows + + mov x26,x0 // save matrix A +// +// Process 8 rows of the matrices. +// + cmp x4,#8 + blt .L\Mode\().ProcessCountMLessThan8 + ProcessRows \Mode\(),8 + +// +// Restore non-volatile registers and return. +// + +.L\Mode\().ExitKernel: + mov x0,x4 + + ldp d14, d15, [sp, #.LMlasSbgemmKernel_backup_d14_d15] + ldp d12, d13, [sp, #.LMlasSbgemmKernel_backup_d12_d13] + ldp d10, d11, [sp, #.LMlasSbgemmKernel_backup_d10_d11] + ldp d8, d9, [sp, #.LMlasSbgemmKernel_backup_d8_d9] + ldp x27, x28, [sp, #.LMlasSbgemmKernel_backup_x27_x28] + ldp x25, x26, [sp, #.LMlasSbgemmKernel_backup_x25_x26] + ldp x23, x24, [sp, #.LMlasSbgemmKernel_backup_x23_x24] + ldp x21, x22, [sp, #.LMlasSbgemmKernel_backup_x21_x22] + ldp x19, x20, [sp], #.LMlasSbgemmKernel_SavedRegisters + + ret + +// +// Process 4 rows of the matrix. +// + +.L\Mode\().ProcessCountMLessThan8: + cmp x4,#4 + blt .L\Mode\().ProcessCountMLessThan4 + ProcessRows \Mode\(),4 + b .L\Mode\().ExitKernel + +// +// Process 2 rows of the matrix. +// + +.L\Mode\().ProcessCountMLessThan4: + cmp x4,#2 + blt .L\Mode\().ProcessCountMLessThan2 + + ProcessRows \Mode\(),2 + b .L\Mode\().ExitKernel + + +// +// Process the last row of the matrix.
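The kernel advertises how many rows it handled through its return value (x4 copied to x0 at .ExitKernel), so a caller can cover an arbitrary CountM by looping. A sketch of that calling pattern, mirroring the MlasSBGemmKernel driver that appears later in this change (sbgemm_kernel_neon.cpp):

```cpp
// Sketch of how a caller consumes the "rows handled" return value of the
// assembly kernel above; the real driver is MlasSBGemmKernel further down.
#include <cstddef>
using bfloat16_t = unsigned short;  // placeholder for the toolchain type

extern "C" size_t MlasSbgemmKernelZero(const float* A, const bfloat16_t* B,
                                       float* C, size_t CountK, size_t CountM,
                                       size_t CountN, size_t lda, size_t ldc,
                                       const float* Bias);

void RunKernelOverAllRows(const float* A, const bfloat16_t* B, float* C,
                          size_t CountM, size_t CountN, size_t CountK,
                          size_t lda, size_t ldc, const float* Bias)
{
    while (CountM > 0) {
        // The kernel processes at most 8 rows per call and reports how many.
        size_t RowsHandled =
            MlasSbgemmKernelZero(A, B, C, CountK, CountM, CountN, lda, ldc, Bias);
        A += lda * RowsHandled;
        C += ldc * RowsHandled;
        CountM -= RowsHandled;
    }
}
```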
+// + +.L\Mode\().ProcessCountMLessThan2: + ProcessRows \Mode\(),1 + b .L\Mode\().ExitKernel + + + .endm + + SbgemmKernelNeonFunction Zero + SbgemmKernelNeonFunction Add diff --git a/onnxruntime/core/mlas/lib/amd64/SoftmaxKernelAvx512F.asm b/onnxruntime/core/mlas/lib/amd64/SoftmaxKernelAvx512F.asm new file mode 100644 index 000000000000..3e83bc852f55 --- /dev/null +++ b/onnxruntime/core/mlas/lib/amd64/SoftmaxKernelAvx512F.asm @@ -0,0 +1,103 @@ +;++ +; +;Copyright (c) Microsoft Corporation. All rights reserved. +; +;Licensed under the MIT License. +; +;Module Name: +; +; SoftmaxKernelAvx512F.asm +; +;Abstract: +; +; This module implements the kernels for the single precision softmax +; operation. +; +; This implementation uses AVX512F instructions. +; +;-- + + .xlist +INCLUDE mlasi.inc + .list + + EXTERN MlasMinimumF32Value:NEAR + +;++ +; +;Routine Description: +; +; This routine implements a vectorized kernel to find the maximum value of +; the supplied buffer. +; +;Arguments: +; +; Input (rcx) - Supplies the input buffer. +; +; N (rdx) - Supplies the number of elements to process. +; +;Return Value: +; +; Returns the maximum value of the supplied buffer. +; +;-- + + LEAF_ENTRY MlasReduceMaximumF32KernelAvx512F, _TEXT + + vbroadcastss zmm0,DWORD PTR [MlasMinimumF32Value] + test rdx,rdx + jz ExitKernel + cmp rdx,16 + jb ProcessRemainingCountBy1 + cmp rdx,64 + jb ProcessRemainingCountBy16 + vmovaps zmm1,zmm0 + vmovaps zmm2,zmm0 + vmovaps zmm3,zmm0 + +ProcessRemainingCountBy64: + vmaxps zmm0,zmm0,ZMMWORD PTR [rcx] + vmaxps zmm1,zmm1,ZMMWORD PTR [rcx+16*4] + sub rdx,64 + vmaxps zmm2,zmm2,ZMMWORD PTR [rcx+32*4] + vmaxps zmm3,zmm3,ZMMWORD PTR [rcx+48*4] + add rcx,64*4 ; advance input by 64 elements + cmp rdx,64 + jae ProcessRemainingCountBy64 + vmaxps zmm0,zmm0,zmm1 ; reduce to single vector + vmaxps zmm2,zmm2,zmm3 + vmaxps zmm0,zmm0,zmm2 + +ProcessRemainingCountBy16: + cmp rdx,16 + jb ProcessRemainingCountLessThan16 + vmaxps zmm0,zmm0,ZMMWORD PTR [rcx] + sub rdx,16 + add rcx,16*4 ; advance input by 16 elements + jmp ProcessRemainingCountBy16 + +ProcessRemainingCountLessThan16: + vextractf32x8 ymm1,zmm0,1 ; reduce to single scalar + vmaxps ymm0,ymm0,ymm1 + vextractf128 xmm1,ymm0,1 + vmaxps xmm0,xmm0,xmm1 + vshufps xmm1,xmm0,xmm0,0EEh + vmaxps xmm0,xmm0,xmm1 + vshufps xmm1,xmm0,xmm0,055h + vmaxss xmm0,xmm0,xmm1 + test rdx,rdx + jz ExitKernel + +ProcessRemainingCountBy1: + vmaxss xmm0,xmm0,DWORD PTR [rcx] + add rcx,4 ; advance input by 1 element + dec edx + jnz ProcessRemainingCountBy1 + +ExitKernel: + vzeroupper + ret + + LEAF_END MlasReduceMaximumF32KernelAvx512F, _TEXT + + END diff --git a/onnxruntime/core/mlas/lib/amx_common.h b/onnxruntime/core/mlas/lib/amx_common.h index 3eb0700932fa..caf94af02362 100644 --- a/onnxruntime/core/mlas/lib/amx_common.h +++ b/onnxruntime/core/mlas/lib/amx_common.h @@ -18,7 +18,7 @@ Module Name: #include "mlasi.h" -#ifdef WIN32 +#ifdef _WIN32 #define tile_dpbssd(dst, src1, src2) _tile_dpbssd(dst, src1, src2) #define tile_dpbsud(dst, src1, src2) _tile_dpbsud(dst, src1, src2) diff --git a/onnxruntime/core/mlas/lib/jblas_defs.h b/onnxruntime/core/mlas/lib/jblas_defs.h deleted file mode 100644 index 9cd1711a3ffd..000000000000 --- a/onnxruntime/core/mlas/lib/jblas_defs.h +++ /dev/null @@ -1,73 +0,0 @@ -/*++ - -Copyright (c) Microsoft Corporation. All rights reserved. - -Licensed under the MIT License. 
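Stepping back to the AVX-512 softmax kernel added above: it hides instruction latency by keeping four independent 16-wide maximum accumulators over 64-element chunks, folds them into one vector, and finishes with a 16-wide stage plus a scalar tail. A scalar C++ equivalent of that reduction strategy (illustrative only; the 16-wide stage is folded into the tail here):

```cpp
// Scalar sketch (not the MLAS implementation) of the reduction strategy in
// MlasReduceMaximumF32KernelAvx512F: four parallel accumulators over
// 64-element chunks, then the remaining elements one at a time.
#include <algorithm>
#include <cstddef>
#include <limits>

float ReduceMaximumReference(const float* Input, size_t N)
{
    // Each accumulator stands in for one zmm register of 16 floats.
    float acc[4];
    std::fill(acc, acc + 4, std::numeric_limits<float>::lowest());
    while (N >= 64) {
        for (int v = 0; v < 4; ++v) {
            for (int i = 0; i < 16; ++i) {
                acc[v] = std::max(acc[v], Input[v * 16 + i]);
            }
        }
        Input += 64;
        N -= 64;
    }
    float result = std::max(std::max(acc[0], acc[1]), std::max(acc[2], acc[3]));
    for (size_t i = 0; i < N; ++i) {  // scalar tail
        result = std::max(result, Input[i]);
    }
    return result;
}
```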
- ---*/ - -#pragma once - -#include "jblas/jit_blas_prologue_b.h" -#include "jblas/jit_blas_wrapper.h" - -namespace jblas -{ - -/* -Name conversion explaination: -Fp32: comp type, determined by GemmCore, can be any jblas::gemm::SCorexxx(float GemmCore) -S4: weight dtype, determined by jblas::prologue_b::gemm::WeightKBlockS4(also support other integer and float weight -classes) -F32F32: input/output dtype, determined by jblas::prologue_a::gemm::ActivationKBlockBaseF32 and -jblas::epilogue::gemm::AccumulatorWriteBackFp32. - -Tips: jblas::epilogue::gemm::CompFp32BlockEpilogue is a fixed class for all fp32 accumulator GemmCores. -*/ -template -using tLauncher_Fp32_S4_F32F32 = jblas::wrapper::gemm::LauncherKBlock< - GemmCore_T::ISA, - GemmCore_T, - jblas::prologue_a::gemm::ActivationKBlockBaseF32, - jblas::prologue_b::gemm::WeightKBlockS4, - jblas::epilogue::gemm::CompFp32BlockEpilogue, - jblas::epilogue::gemm::AccumulatorWriteBackFp32>; - -/* -Name conversion explaination: -Int8: comp type, determined by GemmCore, can be any jblas::gemm::ICorexxx(integer GemmCore) -S4: weight dtype, determined by jblas::prologue_b::gemm::WeightKBlockS4(support integer weight classes only) -F32F32: input/output dtype, determined by jblas::prologue_a::gemm::ActivationKBlockBaseF32 and -jblas::epilogue::gemm::AccumulatorWriteBackFp32. - -Tips: jblas::epilogue::gemm::CompInt8BlockEpilogue is a fixed class for all int32 accumulator GemmCores. -*/ -template -using tLauncher_Int8_S4_F32F32 = jblas::wrapper::gemm::LauncherKBlock< - GemmCore_T::ISA, - GemmCore_T, - jblas::prologue_a::gemm::ActivationF32KBlockQuantize, - jblas::prologue_b::gemm::WeightKBlockS4, - jblas::epilogue::gemm::CompInt8BlockEpilogue, - jblas::epilogue::gemm::AccumulatorWriteBackFp32>; - -using tAVX512F = jblas::gemm::SCoreRowNAvx512f<48, 8>; -using tAMX_BF16 = jblas::gemm::HCoreRowNAmxbf16<64, 16>; -using tAVX512_FP16 = jblas::gemm::HCoreRowNAvx512fp16<96, 8>; -using tAVX_VNNI = jblas::gemm::ICoreRowNAvxvnni<48, 2>; // TODO(Yu) use 24x4 for higher efficiency -using tAVX512_VNNI = jblas::gemm::ICoreRowNAvx512vnni<48, 8>; -using tAMX_INT8_US = jblas::gemm::ICoreRowNAmxint8<64, 16>; -using tAMX_INT8_SS = jblas::gemm::ICoreRowNAmxint8SS<64, 16>; -using tAVX2 = jblas::gemm::SCoreRowNAvx2<48, 2>; // TODO(Yu) use 24x4 for higher efficiency - -class ORTThreading : public jblas::parallel::IThreading -{ - public: - ORTThreading(void* tp); - void parallel_for(const jblas::parallel::thread_func& func) override; - void set_threads(int nthreads) override { assert(0); } - void sync() override { assert(0); } - void* mTp; -}; - -} // namespace jblas diff --git a/onnxruntime/core/mlas/lib/jblas_gemm.cpp b/onnxruntime/core/mlas/lib/jblas_gemm.cpp deleted file mode 100644 index f3cae3186c28..000000000000 --- a/onnxruntime/core/mlas/lib/jblas_gemm.cpp +++ /dev/null @@ -1,534 +0,0 @@ -/*++ - -Copyright (c) Microsoft Corporation. All rights reserved. - -Licensed under the MIT License. - -Module Name: - - jblas_gemm.cpp - -Abstract: - - Currently only support Q4 gemm. 
---*/ - -#include "jblas_gemm.h" - -#include "jblas_defs.h" -#include "mlasi.h" - -using namespace jblas; - -jblas::ORTThreading::ORTThreading(void* tp) - : IThreading(MLAS_THREADPOOL::DegreeOfParallelism(reinterpret_cast(tp))), mTp(tp) -{ -} - -void -jblas::ORTThreading::parallel_for(const jblas::parallel::thread_func& func) -{ - MlasTrySimpleParallel(reinterpret_cast(mTp), mThreadNum, [&](ptrdiff_t tid) { - func(static_cast(tid)); - }); -} - -template -static void -JblasSQ4GemmCompF32( - const size_t M, - const size_t N, - const size_t K, - const float* A, - const size_t lda, - jblas::storage::gemm::StorageWeightKBlockS4* B, - float* C, - const size_t ldc, - int8_t* WorkSpace, - jblas::parallel::IThreading* th -) -{ - auto M_ = static_cast(M); - auto N_ = static_cast(N); - auto K_ = static_cast(K); - auto lda_ = static_cast(lda); - auto ldc_ = static_cast(ldc); - if (M <= 16) { - using Parallel = jblas::parallel::gemm::SchedulerKBlock; - using Launcher = tLauncher_Fp32_S4_F32F32; - static Launcher kernel; - auto reduceA = kernel.mProA.createStorage(M_, K_, B->mBlockSize); - if (B->mIsAsym) { - reduceA.assign(WorkSpace); - ORTThreading single(nullptr); - kernel.mProA.reduce({A, lda_}, &reduceA, M_, K_, &single); - } - typename Launcher::BEpiParam blkargs{ - B->template SPtr(), B->mScaT, B->mCStep, B->template ZPtr(), - reduceA.template get(), reduceA.lda}; - - typename Launcher::Param args{M_, N_, K_, B->mBlockSize, {A, lda_}, {B}, blkargs, {C, ldc_}}; - jblas::parallel::GemmKBlockRun(kernel, args, th); - } else { - using Parallel = jblas::parallel::gemm::SchedulerBase; - using Launcher = jblas::wrapper::gemm::LauncherBase< - GemmCore_T::ISA, GemmCore_T, jblas::prologue_a::gemm::ActivationBase, - jblas::prologue_b::gemm::WeightKBlockS4, jblas::epilogue::gemm::AccumulatorWriteBackFp32>; - static Launcher kernel; - - typename Launcher::Param args{M_, N_, K_, {A, lda_}, {B}, {C, ldc_}}; - jblas::parallel::GemmBaseRun(kernel, args, th); - } -} - -template -static void -JblasSQ4GemmCompInt8( - const size_t M, - const size_t N, - const size_t K, - const float* A, - const size_t lda, - jblas::storage::gemm::StorageWeightKBlockS4* B, - float* C, - const size_t ldc, - int8_t* WorkSpace, - jblas::parallel::IThreading* th -) -{ - using Parallel = jblas::parallel::gemm::SchedulerKBlock; - using Launcher = tLauncher_Int8_S4_F32F32; - auto M_ = static_cast(M); - auto N_ = static_cast(N); - auto K_ = static_cast(K); - auto lda_ = static_cast(lda); - auto ldc_ = static_cast(ldc); - static Launcher kernel; - auto quanA = kernel.mProA.createStorage(M_, K_, B->mBlockSize, B->mIsAsym); - quanA.assign(WorkSpace); - if (M <= 16) { - ORTThreading single(nullptr); - kernel.mProA.quantize({A, lda_, &quanA}, M_, K_, &single); - } else { - kernel.mProA.quantize({A, lda_, &quanA}, M_, K_, th); - } - typename Launcher::Param args{ - M_, - N_, - K_, - B->mBlockSize, - {A, lda_, &quanA}, - {B}, - {B->template SPtr(), B->mScaT, B->mCStep, quanA.template SPtr(), quanA.mCStep, - quanA.template ZPtr(), B->template RPtr(), B->mRedT, B->template ZPtr(), - quanA.template RPtr(), B->mBlockSize}, - {C, ldc_}}; - jblas::parallel::GemmKBlockRun(kernel, args, th); -} - -bool -JblasSQ4GemmBatchDriver( - const size_t M, - const size_t N, - const size_t K, - const size_t BatchN, - const MLAS_SQNBITS_GEMM_DATA_PACKED_PARAMS* DataParams, - int8_t* WorkSpace, - MLAS_THREADPOOL* ThreadPool -) -{ - GetCPUDevice(); - ORTThreading orth(ThreadPool); - bool processed = true; - for (size_t i = 0; i < BatchN; i++) { - auto ptr = 
jblas::storage::gemm::PackedWeightParser::deserialBuffer(DataParams[i].B); - auto uptr = std::unique_ptr(ptr); - if (ptr) { - if (ptr->mPrologueID == JBLAS_PROLOGUEB_IDS::WeightKBlockS4) { - auto kptr = reinterpret_cast(ptr); - auto coretype = ptr->mCoreId; - auto NTile = jblas::gemm::CoreAttr::get_mask_val( - ptr->mCoreId, jblas::gemm::CoreAttr::NTILE_MASK, jblas::gemm::CoreAttr::NTILE_SHIFT - ); - auto CType = jblas::gemm::CoreAttr::get_mask_val( - ptr->mCoreId, jblas::gemm::CoreAttr::COMP_MASK, jblas::gemm::CoreAttr::COMP_SHIFT - ); - if (CType == uint32_t(gemm::CompType::COMP_FP32)) { - if (NTile == tAVX512F::NTILE && _cd->AVX512F()) { - JblasSQ4GemmCompF32( - M, N, K, DataParams[i].A, DataParams[i].lda, kptr, DataParams[i].C, DataParams[i].ldc, - WorkSpace, &orth - ); - } else if (NTile == tAVX2::NTILE && _cd->AVX2()) { - JblasSQ4GemmCompF32( - M, N, K, DataParams[i].A, DataParams[i].lda, kptr, DataParams[i].C, DataParams[i].ldc, - WorkSpace, &orth - ); - } - } - if (CType == uint32_t(gemm::CompType::COMP_INT8_US_INT32)) { - if (NTile == tAMX_INT8_US::NTILE && _cd->AMX_INT8()) { - JblasSQ4GemmCompInt8( - M, N, K, DataParams[i].A, DataParams[i].lda, kptr, DataParams[i].C, DataParams[i].ldc, - WorkSpace, &orth - ); - } else if (NTile == tAVX512_VNNI::NTILE && _cd->AVX512_VNNI()) { - JblasSQ4GemmCompInt8( - M, N, K, DataParams[i].A, DataParams[i].lda, kptr, DataParams[i].C, DataParams[i].ldc, - WorkSpace, &orth - ); - } else if (NTile == tAVX_VNNI::NTILE && _cd->AVX_VNNI()) { - JblasSQ4GemmCompInt8( - M, N, K, DataParams[i].A, DataParams[i].lda, kptr, DataParams[i].C, DataParams[i].ldc, - WorkSpace, &orth - ); - } - } - if (CType == uint32_t(gemm::CompType::COMP_INT8_SS_INT32)) { - if (NTile == tAMX_INT8_SS::NTILE && _cd->AMX_INT8()) { - JblasSQ4GemmCompInt8( - M, N, K, DataParams[i].A, DataParams[i].lda, kptr, DataParams[i].C, DataParams[i].ldc, - WorkSpace, &orth - ); - } - } - } - } else { - processed = false; - break; - } - } - return processed; -} - -template -static size_t -JblasSQ4GemmCompF32WorkspaceSize( - const size_t M, - const size_t N, - const size_t K, - const float* A, - const size_t lda, - jblas::storage::gemm::StorageWeightKBlockS4* B, - float* C, - const size_t ldc -) -{ - auto M_ = static_cast(M); - auto K_ = static_cast(K); - (void)(N); - (void)(lda); - (void)(ldc); - if (M <= 16) { - using Launcher = tLauncher_Fp32_S4_F32F32; - static Launcher kernel; - if (B->mIsAsym) { - auto reduceA = kernel.mProA.createStorage(M_, K_, B->mBlockSize); - return reduceA.mSize; - } - return 0; - } else { - using Launcher = jblas::wrapper::gemm::LauncherBase< - GemmCore_T::ISA, GemmCore_T, jblas::prologue_a::gemm::ActivationBase, - jblas::prologue_b::gemm::WeightKBlockS4, jblas::epilogue::gemm::AccumulatorWriteBackFp32>; - static Launcher kernel; - return 0; - } - return 0; -} - -template -static size_t -JblasSQ4GemmCompInt8WorkspaceSize( - const size_t M, - const size_t N, - const size_t K, - const float* A, - const size_t lda, - jblas::storage::gemm::StorageWeightKBlockS4* B, - float* C, - const size_t ldc -) -{ - using Parallel = jblas::parallel::gemm::SchedulerKBlock; - using Launcher = tLauncher_Int8_S4_F32F32; - static Launcher kernel; - (void)(N); - (void)(lda); - (void)(ldc); - auto quanA = kernel.mProA.createStorage( - static_cast(M), static_cast(K), static_cast(B->mBlockSize), B->mIsAsym - ); - return quanA.mSize; -} - -size_t -JblasSQ4GemmBatchWorkspaceSize( - const size_t M, - const size_t N, - const size_t K, - const size_t BatchN, - const 
MLAS_SQNBITS_GEMM_DATA_PACKED_PARAMS* DataParams -) -{ - GetCPUDevice(); - size_t size = 0; - for (size_t i = 0; i < BatchN; i++) { - auto ptr = jblas::storage::gemm::PackedWeightParser::deserialBuffer(DataParams[i].B); - auto uptr = std::unique_ptr(ptr); - if (ptr) { - if (ptr->mPrologueID == JBLAS_PROLOGUEB_IDS::WeightKBlockS4) { - auto kptr = reinterpret_cast(ptr); - auto coretype = ptr->mCoreId; - auto NTile = jblas::gemm::CoreAttr::get_mask_val( - ptr->mCoreId, jblas::gemm::CoreAttr::NTILE_MASK, jblas::gemm::CoreAttr::NTILE_SHIFT - ); - auto CType = jblas::gemm::CoreAttr::get_mask_val( - ptr->mCoreId, jblas::gemm::CoreAttr::COMP_MASK, jblas::gemm::CoreAttr::COMP_SHIFT - ); - if (CType == uint32_t(gemm::CompType::COMP_FP32)) { - if (NTile == tAVX512F::NTILE && _cd->AVX512F()) { - size = std::max( - JblasSQ4GemmCompF32WorkspaceSize( - M, N, K, DataParams[i].A, DataParams[i].lda, kptr, DataParams[i].C, DataParams[i].ldc - ), - size - ); - } else if (NTile == tAVX2::NTILE && _cd->AVX2()) { - size = std::max( - JblasSQ4GemmCompF32WorkspaceSize( - M, N, K, DataParams[i].A, DataParams[i].lda, kptr, DataParams[i].C, DataParams[i].ldc - ), - size - ); - } - } - if (CType == uint32_t(gemm::CompType::COMP_INT8_US_INT32)) { - if (NTile == tAMX_INT8_US::NTILE && _cd->AMX_INT8()) { - size = std::max( - JblasSQ4GemmCompInt8WorkspaceSize( - M, N, K, DataParams[i].A, DataParams[i].lda, kptr, DataParams[i].C, DataParams[i].ldc - ), - size - ); - } else if (NTile == tAVX512_VNNI::NTILE && _cd->AVX512_VNNI()) { - size = std::max( - JblasSQ4GemmCompInt8WorkspaceSize( - M, N, K, DataParams[i].A, DataParams[i].lda, kptr, DataParams[i].C, DataParams[i].ldc - ), - size - ); - } else if (NTile == tAVX_VNNI::NTILE && _cd->AVX_VNNI()) { - size = std::max( - JblasSQ4GemmCompInt8WorkspaceSize( - M, N, K, DataParams[i].A, DataParams[i].lda, kptr, DataParams[i].C, DataParams[i].ldc - ), - size - ); - } - } - if (CType == uint32_t(gemm::CompType::COMP_INT8_SS_INT32)) { - if (NTile == tAMX_INT8_SS::NTILE && _cd->AMX_INT8()) { - size = std::max( - JblasSQ4GemmCompInt8WorkspaceSize( - M, N, K, DataParams[i].A, DataParams[i].lda, kptr, DataParams[i].C, DataParams[i].ldc - ), - size - ); - } - } - } - } - } - return size; -} - -template -static size_t -JblasQ4BuSize(size_t block_size, size_t N, size_t K, bool isAsym) -{ - static T launcher; - auto stor = launcher.mProB.createStorage( - static_cast(N), static_cast(K), static_cast(block_size), JBLAS_DTYPE::S4_CLIP, JBLAS_DTYPE::F32, - JBLAS_DTYPE::BF16, isAsym - ); - // TODO(Yu) support more scale dtype - return stor.mSize; -} - -size_t -JblasQ4GemmPackBSize(size_t N, size_t K, size_t BlkSize, bool isAsym, MLAS_SQNBIT_COMPUTE_TYPE CompType) -{ - GetCPUDevice(); - if (K % BlkSize != 0) { - return 0; - } - // from low precision to high precision - switch (CompType) { - case CompInt8: - if (_cd->AMX_INT8() && BlkSize % tAMX_INT8_SS::KTILE == 0) { - return JblasQ4BuSize>(BlkSize, N, K, isAsym); - } - if (_cd->AVX512_VNNI() && BlkSize % tAVX512_VNNI::KTILE == 0) { - return JblasQ4BuSize>(BlkSize, N, K, isAsym); - } - if (_cd->AVX_VNNI() && BlkSize % tAVX_VNNI::KTILE == 0) { - return JblasQ4BuSize>(BlkSize, N, K, isAsym); - } - case CompBf16: - case CompFp16: - case CompFp32: - case CompUndef: - if (_cd->AVX512F() && BlkSize % tAVX512F::KTILE == 0) { - return JblasQ4BuSize>(BlkSize, N, K, isAsym); - } - if (_cd->AVX2() && BlkSize % tAVX2::KTILE == 0) { - return JblasQ4BuSize>(BlkSize, N, K, isAsym); - } - break; - default: - return 0; - } - return 0; -} - -template -static void 
-JblasQ4GemmPackBImpl( - void* PackedBuf, - size_t BlkSize, - const uint8_t* QData, - const float* Scale, - const uint8_t* Zp, - size_t N, - size_t K, - bool IsAsym, - bool lastCall, - size_t ldb, - MLAS_THREADPOOL* ThreadPool -) -{ - static T JblasKernel; - auto N_ = static_cast(N); - auto K_ = static_cast(K); - auto stor = JblasKernel.mProB.createStorage( - N_, K_, static_cast(BlkSize), JBLAS_DTYPE::S4_CLIP, JBLAS_DTYPE::F32, JBLAS_DTYPE::BF16, IsAsym - ); - stor.assign(reinterpret_cast(PackedBuf)); - ORTThreading orth(ThreadPool); - JblasKernel.mProB.packNbitsWeight(N_, K_, IsAsym, QData, static_cast(ldb), Scale, Zp, &stor, &orth); - if (lastCall) { - JblasKernel.mProB.reduceWeight(&stor, &orth); - } -} - -bool -JblasQ4GemmPackB( - void* PackedBuf, - const uint8_t* QData, - const float* Scale, - const uint8_t* Zp, - size_t N, - size_t K, - size_t ldb, - size_t BlkSize, - bool isAsym, - bool lastCall, - MLAS_SQNBIT_COMPUTE_TYPE CompType, - MLAS_THREADPOOL* ThreadPool -) -{ - GetCPUDevice(); - // explicit statement fall through. - switch (CompType) { - case CompInt8: - if (_cd->AMX_INT8() && BlkSize % tAMX_INT8_SS::KTILE == 0) { - JblasQ4GemmPackBImpl>( - PackedBuf, BlkSize, QData, Scale, Zp, N, K, isAsym, lastCall, ldb, ThreadPool - ); - return true; - } - if (_cd->AVX512_VNNI() && BlkSize % tAVX512_VNNI::KTILE == 0) { - JblasQ4GemmPackBImpl>( - PackedBuf, BlkSize, QData, Scale, Zp, N, K, isAsym, lastCall, ldb, ThreadPool - ); - return true; - } - if (_cd->AVX_VNNI() && BlkSize % tAVX_VNNI::KTILE == 0) { - JblasQ4GemmPackBImpl>( - PackedBuf, BlkSize, QData, Scale, Zp, N, K, isAsym, lastCall, ldb, ThreadPool - ); - return true; - } - case CompBf16: - case CompFp16: - case CompFp32: - case CompUndef: - if (_cd->AVX512F() && BlkSize % tAVX512F::KTILE == 0) { - JblasQ4GemmPackBImpl>( - PackedBuf, BlkSize, QData, Scale, Zp, N, K, isAsym, lastCall, ldb, ThreadPool - ); - return true; - } - if (_cd->AVX2() && BlkSize % tAVX2::KTILE == 0) { - JblasQ4GemmPackBImpl>( - PackedBuf, BlkSize, QData, Scale, Zp, N, K, isAsym, lastCall, ldb, ThreadPool - ); - return true; - } - default: - return false; - } - return false; -} - -bool -JblasQ4GemmUnPackB(float* FpData, const void* PackedBuf, size_t N, size_t K, size_t ldb, MLAS_THREADPOOL* ThreadPool) -{ - auto ptr = jblas::storage::gemm::PackedWeightParser::deserialBuffer(PackedBuf); - auto uptr = std::unique_ptr(ptr); - ORTThreading orth(ThreadPool); - auto N_ = static_cast(N); - auto K_ = static_cast(K); - auto ldb_ = static_cast(ldb); - GetCPUDevice(); - if (ptr) { - if (ptr->mPrologueID == JBLAS_PROLOGUEB_IDS::WeightKBlockS4) { - auto NTile = jblas::gemm::CoreAttr::get_mask_val( - ptr->mCoreId, jblas::gemm::CoreAttr::NTILE_MASK, jblas::gemm::CoreAttr::NTILE_SHIFT - ); - auto CType = jblas::gemm::CoreAttr::get_mask_val( - ptr->mCoreId, jblas::gemm::CoreAttr::COMP_MASK, jblas::gemm::CoreAttr::COMP_SHIFT - ); - if (CType == uint32_t(jblas::gemm::CompType::COMP_FP32)) { - if (NTile == tAVX512F::NTILE && _cd->AVX512F()) { - static jblas::prologue_b::gemm::WeightKBlockS4 proB; - proB.unpackWeight(N_, K_, ptr, FpData, ldb_, &orth); - } else if (NTile == tAVX2::NTILE && _cd->AVX2()) { - static jblas::prologue_b::gemm::WeightKBlockS4 proB; - proB.unpackWeight(N_, K_, ptr, FpData, ldb_, &orth); - } - } - if (CType == uint32_t(jblas::gemm::CompType::COMP_INT8_US_INT32)) { - if (NTile == tAMX_INT8_US::NTILE && _cd->AMX_INT8()) { - static jblas::prologue_b::gemm::WeightKBlockS4 proB; - proB.unpackWeight(N_, K_, ptr, FpData, ldb_, &orth); - } else if (NTile == 
tAVX512_VNNI::NTILE && _cd->AVX512_VNNI()) { - static jblas::prologue_b::gemm::WeightKBlockS4 proB; - proB.unpackWeight(N_, K_, ptr, FpData, ldb_, &orth); - } else if (NTile == tAVX_VNNI::NTILE && _cd->AVX_VNNI()) { - static jblas::prologue_b::gemm::WeightKBlockS4 proB; - proB.unpackWeight(N_, K_, ptr, FpData, ldb_, &orth); - } - } - if (CType == uint32_t(jblas::gemm::CompType::COMP_INT8_SS_INT32)) { - if (NTile == tAMX_INT8_SS::NTILE && _cd->AMX_INT8()) { - static jblas::prologue_b::gemm::WeightKBlockS4 proB; - proB.unpackWeight(N_, K_, ptr, FpData, ldb_, &orth); - } - } - } - return true; - } - return false; -} diff --git a/onnxruntime/core/mlas/lib/jblas_gemm.h b/onnxruntime/core/mlas/lib/jblas_gemm.h deleted file mode 100644 index 044dc5e849a0..000000000000 --- a/onnxruntime/core/mlas/lib/jblas_gemm.h +++ /dev/null @@ -1,61 +0,0 @@ -/*++ - -Copyright (c) Microsoft Corporation. All rights reserved. - -Licensed under the MIT License. - -Module Name: - - jblas_gemm.h - -Abstract: - - Currently only support Q4 gemm. ---*/ - -#pragma once - -#include "mlas_qnbit.h" - -size_t -JblasQ4GemmPackBSize(size_t N, size_t K, size_t BlkSize, bool isAsym, MLAS_SQNBIT_COMPUTE_TYPE CompType); - -bool -JblasQ4GemmPackB( - void* PackedBuf, - const uint8_t* QData, - const float* Scale, - const uint8_t* Zp, - size_t N, - size_t K, - size_t ldb, - size_t BlkSize, - bool isAsym, - bool lastCall, - MLAS_SQNBIT_COMPUTE_TYPE CompType, - MLAS_THREADPOOL* ThreadPool -); - -bool -JblasQ4GemmUnPackB(float* FpData, const void* PackedBuf, size_t N, size_t K, size_t ldb - , MLAS_THREADPOOL* ThreadPool); - -bool -JblasSQ4GemmBatchDriver( - const size_t M, - const size_t N, - const size_t K, - const size_t BatchN, - const MLAS_SQNBITS_GEMM_DATA_PACKED_PARAMS* DataParams, - int8_t* WorkSpace, - MLAS_THREADPOOL* ThreadPool -); - -size_t -JblasSQ4GemmBatchWorkspaceSize( - const size_t M, - const size_t N, - const size_t K, - const size_t BatchN, - const MLAS_SQNBITS_GEMM_DATA_PACKED_PARAMS* DataParams -); diff --git a/onnxruntime/core/mlas/lib/mlasi.h b/onnxruntime/core/mlas/lib/mlasi.h index 7bb8b17031a8..4b93dde1bcef 100644 --- a/onnxruntime/core/mlas/lib/mlasi.h +++ b/onnxruntime/core/mlas/lib/mlasi.h @@ -193,6 +193,8 @@ class MLASCPUIDInfo bool HasArmSVE_I8MM() const { return has_arm_sve_i8mm_; } + bool HasArmNeon_BF16() const { return has_arm_neon_bf16_; } + private: MLASCPUIDInfo(); @@ -200,6 +202,7 @@ class MLASCPUIDInfo bool has_fp16_{false}; bool has_arm_neon_i8mm_{false}; bool has_arm_sve_i8mm_{false}; + bool has_arm_neon_bf16_{false}; }; using MLAS_CPUIDINFO = MLASCPUIDInfo; @@ -357,6 +360,20 @@ size_t #else +#if defined(__aarch64__) && defined(__linux__) +typedef size_t(MLASCALL MLAS_SBGEMM_FLOAT_KERNEL)( + const float* A, + const bfloat16_t* B, + float* C, + size_t CountK, + size_t CountM, + size_t CountN, + size_t lda, + size_t ldc, + const float* Bias +); +#endif + typedef size_t (MLASCALL MLAS_GEMM_FLOAT_KERNEL)( @@ -727,6 +744,10 @@ extern "C" { #else MLAS_GEMM_FLOAT_KERNEL MlasSgemmKernelZero; MLAS_GEMM_FLOAT_KERNEL MlasSgemmKernelAdd; +#if defined(__aarch64__) && defined(__linux__) + MLAS_SBGEMM_FLOAT_KERNEL MlasSbgemmKernelZero; + MLAS_SBGEMM_FLOAT_KERNEL MlasSbgemmKernelAdd; +#endif MLAS_GEMM_DOUBLE_KERNEL MlasDgemmKernelZero; MLAS_GEMM_DOUBLE_KERNEL MlasDgemmKernelAdd; #endif @@ -825,6 +846,7 @@ extern "C" { MLAS_REDUCE_MINIMUM_MAXIMUM_FLOAT_KERNEL MlasReduceMinimumMaximumF32Kernel; #if defined(MLAS_TARGET_AMD64) MLAS_REDUCE_MAXIMUM_FLOAT_KERNEL MlasReduceMaximumF32KernelAvx; + 
MLAS_REDUCE_MAXIMUM_FLOAT_KERNEL MlasReduceMaximumF32KernelAvx512F; MLAS_REDUCE_MINIMUM_MAXIMUM_FLOAT_KERNEL MlasReduceMinimumMaximumF32KernelAvx; #endif @@ -856,6 +878,10 @@ extern "C" { #define MLAS_DGEMM_THREAD_COMPLEXITY (size_t(64) * size_t(1024)) #define MLAS_QGEMM_THREAD_COMPLEXITY 65536 +#if defined(__aarch64__) && defined(__linux__) +#define MLAS_SBGEMM_THREAD_COMPLEXITY (size_t(64) * size_t(1024)) +#endif + // // Single-threaded single precision matrix/matrix multiply operation. // diff --git a/onnxruntime/core/mlas/lib/platform.cpp b/onnxruntime/core/mlas/lib/platform.cpp index 8329a34f1338..a53c5085b10c 100644 --- a/onnxruntime/core/mlas/lib/platform.cpp +++ b/onnxruntime/core/mlas/lib/platform.cpp @@ -60,6 +60,10 @@ MLASCPUIDInfo::MLASCPUIDInfo() #define HWCAP2_SVEI8MM (1 << 9) #endif +#ifndef HWCAP2_BF16 +#define HWCAP2_BF16 (1 << 14) +#endif + #if defined(BUILD_MLAS_NO_ONNXRUNTIME) MLASCPUIDInfo::MLASCPUIDInfo() { @@ -70,6 +74,8 @@ MLASCPUIDInfo::MLASCPUIDInfo() has_arm_neon_i8mm_ = ((getauxval(AT_HWCAP2) & HWCAP2_I8MM) != 0); has_arm_sve_i8mm_ = ((getauxval(AT_HWCAP2) & HWCAP2_SVEI8MM) != 0); + + has_arm_neon_bf16_ = ((getauxval(AT_HWCAP2) & HWCAP2_BF16) != 0); } #endif @@ -415,6 +421,7 @@ Return Value: this->PoolFloatKernel[MlasAveragePoolingIncludePad] = MlasPoolAverageIncludePadFloatKernelAvx512F; this->ComputeExpF32Kernel = MlasComputeExpF32KernelAvx512F; this->ComputeSumExpF32Kernel = MlasComputeSumExpF32KernelAvx512F; + this->ReduceMaximumF32Kernel = MlasReduceMaximumF32KernelAvx512F; this->QuantizeLinearS8Kernel = MlasQuantizeLinearS8KernelAvx512F; this->QuantizeLinearU8Kernel = MlasQuantizeLinearU8KernelAvx512F; this->NchwcBlockSize = 16; @@ -482,7 +489,6 @@ Return Value: this->SymmQgemmDispatch = &MlasSymmQgemmS8DispatchNeon; this->ConvSymU8S8Dispatch = &MlasConvSymU8DispatchNeon; this->ConvSymS8S8Dispatch = &MlasConvSymS8DispatchNeon; - this->SQNBitGemmDispatch = &MlasSQNBitGemmDispatchNeon; // // Check if the processor supports ASIMD dot product instructions. @@ -512,6 +518,9 @@ Return Value: this->SymmQgemmDispatch = &MlasSymmQgemmS8DispatchSdot; this->ConvSymU8S8Dispatch = &MlasConvSymU8DispatchDot; this->ConvSymS8S8Dispatch = &MlasConvSymS8DispatchDot; + + // MlasSQNBitGemmDispatchNeon has a dependency on dot product instructions + this->SQNBitGemmDispatch = &MlasSQNBitGemmDispatchNeon; } #if defined(__linux__) diff --git a/onnxruntime/core/mlas/lib/sbgemm.h b/onnxruntime/core/mlas/lib/sbgemm.h new file mode 100644 index 000000000000..de7fd72fad45 --- /dev/null +++ b/onnxruntime/core/mlas/lib/sbgemm.h @@ -0,0 +1,399 @@ +/*++ + +Copyright (c) Microsoft Corporation. All rights reserved. +Copyright 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved. + +Licensed under the MIT License. + +Module Name: + + sbgemm.h + +Abstract: + + This module defines the set of template functions to implement bfloat16 + precision matrix/matrix multiply operation (SBGEMM). + + To implement a new kernel, template functions below need to be specialized: + MlasSBGemmConvertPackB + MlasSBGemmPackedBOffset + MlasSBGemmPackedBLeadingDim + MlasSBGemmKernel + + MlasSBGemmOperation is the shared kernel driver. 
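The platform.cpp hunk above gates the new bfloat16 kernels on the BF16 bit of the Linux auxiliary vector, defining HWCAP2_BF16 locally in case the libc headers predate the kernel flag. The probe in isolation looks like this (Linux/AArch64 only, same constant and fallback as the diff):

```cpp
// Standalone sketch of the BF16 capability probe added to MLASCPUIDInfo.
#include <sys/auxv.h>

#ifndef HWCAP2_BF16
#define HWCAP2_BF16 (1 << 14)  // fallback definition, as in platform.cpp
#endif

bool CpuSupportsBf16()
{
    return (getauxval(AT_HWCAP2) & HWCAP2_BF16) != 0;
}
```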
+ + A kernel type should define the following constants: + bool PackNeeded; Whether B needs to be packed + size_t KernelMaxM; Max # rows the vectorized kernel can process + size_t PackedK; Packed alignment on the K dim (power of 2) + size_t PackedN; Packed alignment on the N dim (power of 2) + MLAS_SBGEMM_STRIDES Strides{128, 128, 256}; +--*/ + +#if defined(__aarch64__) && defined(__linux__) + +#pragma once + +#include +#include + +#include "mlasi.h" + +/** + * @brief Define the default striding parameters for + * the bfloat16 precision gemm operation + */ +struct MLAS_SBGEMM_STRIDES { + size_t M; + size_t N; + size_t K; +}; + +/** + * @brief Convert fp32 matrix B to bf16 and pack the data + * + * @tparam KernelType + * @param[out] PackedB Address of packing buffer + * @param[in] B Address of source matrix B in fp32 + * @param[in] ldb Leading dimension of B + * @param[in] CountN # of columns to pack + * @param[in] CountK # of rows to pack + */ +template <typename KernelType> +void +MlasSBGemmConvertPackB( + bfloat16_t* PackedB, const float* B, size_t ldb, size_t CountN, size_t CountK +); + +/** + * @brief Find the location of PackedB[StartK, StartN] + * + * @tparam KernelType + * @param PackedB + * @param DimN Total columns of the packing buffer + * @param DimK Total rows of the packing buffer + * @param StartN + * @param StartK + * @return Address of PackedB[StartK, StartN] + */ +template <typename KernelType> +MLAS_FORCEINLINE const bfloat16_t* +MlasSBGemmPackedBOffset( + const bfloat16_t* PackedB, size_t DimN, size_t DimK, size_t StartN, size_t StartK +) +{ + // By default the packed buffer is just a row major + // K row by N column buffer + MLAS_UNREFERENCED_PARAMETER(DimK); + return PackedB + StartK * DimN + StartN; +} + +/** + * @brief Leading dimension of the packed B buffer + * Related to how B is packed + * @tparam KernelType + * @param DimN + * @param DimK + * @return Leading dimension of the packed B buffer + */ +template <typename KernelType> +MLAS_FORCEINLINE size_t +MlasSBGemmPackedBLeadingDim(size_t DimN, size_t DimK) +{ + // By default the packed buffer is just a row major + // K row by N column buffer + MLAS_UNREFERENCED_PARAMETER(DimK); + return DimN; +} + +template <typename KernelType> +void +MlasSBGemmKernel(const size_t CountM, const size_t CountN, const size_t CountK, const float* A, const size_t lda, const bfloat16_t* B, float* C, size_t ldc, const float* Bias, const bool ZeroMode); + +template <typename KernelType> +MLAS_FORCEINLINE void +MlasSBGemmPackedOperation(size_t M, size_t RangeStartN, size_t RangeCountN, size_t AlignedN, size_t K, const float* A, size_t lda, const void* PackedB, float* C, size_t ldc, const float* Bias, void* PostProcessor) +{ + constexpr MLAS_SBGEMM_STRIDES Strides = KernelType::Strides; + size_t PackedStrideN = Strides.N; + size_t PackedStrideK = Strides.K; + + // + // Step through each slice of matrix B along the N dimension. + // + size_t CountN; + for (size_t n = 0; n < RangeCountN; n += CountN) { + const size_t SliceStartN = RangeStartN + n; + CountN = std::min(RangeCountN - n, PackedStrideN); + + // + // Step through each slice of matrix B along the K dimension. + // + size_t CountK; + for (size_t k = 0; k < K; k += CountK) { + bool ZeroMode = (k == 0); + CountK = std::min(K - k, PackedStrideK); + + const bfloat16_t* pb = (const bfloat16_t*)PackedB + AlignedN * k + CountK * SliceStartN; + float* c = C + n; + const float* pbias = ((nullptr == Bias) ? nullptr : Bias + RangeStartN + n); + MlasSBGemmKernel<KernelType>(M, CountN, CountK, A + k, lda, pb, c, ldc, ZeroMode ?
pbias : nullptr, ZeroMode); + } + if (PostProcessor != nullptr) { + ((MLAS_SBGEMM_POSTPROCESSOR*)PostProcessor) + ->Process(C + n, M, SliceStartN, M, CountN, ldc); + } + } +} + +template <typename KernelType> +void +MlasSBGemmNonPackedOperation(size_t M, size_t N, size_t K, const float* A, size_t lda, const float* B, size_t ldb, float* C, size_t ldc, const float* Bias, void* PostProcessor) +{ + // + // Compute the strides to step through slices of the input matrices. + // + // Expand the N stride if K is small or expand the K stride if N is small + // for better utilization of the B panel. Avoid changing the K stride if + // the A panel needs to be used for transposing. + // + constexpr MLAS_SBGEMM_STRIDES Strides = KernelType::Strides; + size_t StrideN = Strides.N; + size_t StrideK = Strides.K; + + if (N >= K) { + while (StrideK / 2 >= K) { + StrideN *= 2; + StrideK /= 2; + } + } else { + while (StrideN > 16 && StrideN / 2 >= N) { + StrideK *= 2; + StrideN /= 2; + } + } + + constexpr size_t packBSize = UpAlignSize(Strides.N * Strides.K * sizeof(bfloat16_t)); + MlasThreadedBufAlloc(packBSize); + uint8_t* p = ThreadedBufHolder.get(); + auto* PanelB = reinterpret_cast<bfloat16_t*>(p); + + // + // Step through each slice of matrix B along the N dimension. + // + size_t CountN; + for (size_t n = 0; n < N; n += CountN) { + CountN = std::min(N - n, StrideN); + + // + // Step through each slice of matrix B along the K dimension. + // + size_t CountK; + for (size_t k = 0; k < K; k += CountK) { + CountK = std::min(K - k, StrideK); + + // + // Copy a panel of matrix B to a local packed buffer. + // + MlasSBGemmConvertPackB<KernelType>(PanelB, B + n + k * ldb, ldb, CountN, CountK); + + auto* c = C + n; + const float* pbias = + ((nullptr == Bias) ? nullptr : Bias + n); // TODO: check the SliceNStart + + bool ZeroMode = (k == 0); + MlasSBGemmKernel<KernelType>(M, CountN, CountK, A + k, lda, PanelB, c, ldc, ZeroMode ? pbias : nullptr, ZeroMode); + } + if (PostProcessor != nullptr) { + ((MLAS_SBGEMM_POSTPROCESSOR*)PostProcessor)->Process(C + n, M, N, M, CountN, ldc); + } + } +} + +template <typename KernelType> +void +MlasSBGemmOperation(const ptrdiff_t ThreadCountM, const ptrdiff_t ThreadCountN, const size_t M, const size_t N, const size_t K, const MLAS_SBGEMM_DATA_PARAMS* DataParams, ptrdiff_t ThreadId) +{ + const ptrdiff_t ThreadIdM = ThreadId / ThreadCountN; + const ptrdiff_t ThreadIdN = ThreadId % ThreadCountN; + + // + // Partition the operation along the M dimension. + // + size_t RangeStartM; + size_t RangeCountM; + + MlasPartitionWork(ThreadIdM, ThreadCountM, M, &RangeStartM, &RangeCountM); + + // + // Partition the operation along the N dimension. + // + size_t RangeStartN; + size_t RangeCountN; + + const size_t BlockedN = + (N + MLAS_SGEMM_STRIDEN_THREAD_ALIGN - 1) / MLAS_SGEMM_STRIDEN_THREAD_ALIGN; + + MlasPartitionWork(ThreadIdN, ThreadCountN, BlockedN, &RangeStartN, &RangeCountN); + + RangeStartN *= MLAS_SGEMM_STRIDEN_THREAD_ALIGN; + RangeCountN *= MLAS_SGEMM_STRIDEN_THREAD_ALIGN; + + RangeCountN = std::min(N - RangeStartN, RangeCountN); + + // + // Dispatch the partitioned operation.
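The partitioning just above hands each worker a whole number of MLAS_SGEMM_STRIDEN_THREAD_ALIGN-wide column blocks, so threads never split a packed panel. A sketch of that blocked partition arithmetic, with a simplified stand-in for MlasPartitionWork and an assumed alignment value:

```cpp
// Sketch (not the MLAS source) of the blocked N-dimension partitioning in
// MlasSBGemmOperation. STRIDEN_ALIGN stands in for
// MLAS_SGEMM_STRIDEN_THREAD_ALIGN; 16 is an assumed value for illustration.
#include <algorithm>
#include <cstddef>

constexpr size_t STRIDEN_ALIGN = 16;

// Evenly divide `Count` items among `Partitions` workers.
void PartitionWork(size_t Id, size_t Partitions, size_t Count,
                   size_t* Start, size_t* Size)
{
    size_t PerPartition = Count / Partitions;
    size_t Remainder = Count % Partitions;
    *Start = Id * PerPartition + std::min(Id, Remainder);
    *Size = PerPartition + (Id < Remainder ? 1 : 0);
}

// Returns the column range owned by thread ThreadIdN. Callers ensure the
// thread count does not exceed the number of blocks, so every worker owns
// at least one block and the final clamp cannot underflow.
void PartitionColumns(size_t ThreadIdN, size_t ThreadCountN, size_t N,
                      size_t* RangeStartN, size_t* RangeCountN)
{
    const size_t BlockedN = (N + STRIDEN_ALIGN - 1) / STRIDEN_ALIGN;
    PartitionWork(ThreadIdN, ThreadCountN, BlockedN, RangeStartN, RangeCountN);
    *RangeStartN *= STRIDEN_ALIGN;
    *RangeCountN *= STRIDEN_ALIGN;
    *RangeCountN = std::min(N - *RangeStartN, *RangeCountN);
}
```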
+ // + const size_t lda = DataParams->lda; + const size_t ldc = DataParams->ldc; + const float* A = (const float*)DataParams->A + RangeStartM * lda; + float* C = DataParams->C + RangeStartM * ldc + RangeStartN; + const float* bias = DataParams->Bias; + + if (!DataParams->BIsfp32) { + MlasSBGemmPackedOperation<KernelType>( + RangeCountM, RangeStartN, RangeCountN, BlockedN * MLAS_SGEMM_STRIDEN_THREAD_ALIGN, K, A, + lda, DataParams->B, C, ldc, bias, (void*)DataParams->OutputProcessor + ); + } else { + const size_t ldb = DataParams->ldb; + const float* B = (const float*)DataParams->B + RangeStartN; + MlasSBGemmNonPackedOperation<KernelType>(RangeCountM, RangeCountN, K, A, lda, B, ldb, C, ldc, bias, (void*)DataParams->OutputProcessor); + } +} + +// +// dispatch structure. +// +typedef void(MLAS_SBGEMM_OPERATION)(const ptrdiff_t ThreadCountM, const ptrdiff_t ThreadCountN, const size_t M, const size_t N, const size_t K, const MLAS_SBGEMM_DATA_PARAMS* DataParams, ptrdiff_t ThreadId); + +typedef void(MLAS_SBGEMM_CONVERTPACKB_ROUTINE)( + bfloat16_t* D, const float* B, size_t ldb, size_t CountN, size_t CountK +); + +/** + * @brief Hardware dependent dispatch for bfloat16 precision GEMM + */ +struct MLAS_SBGEMM_DISPATCH { + MLAS_SBGEMM_OPERATION* Operation; /**< SBGemm driver */ + MLAS_SBGEMM_CONVERTPACKB_ROUTINE* ConvertPackBRoutine; /**< Convert and pack function for B */ + size_t PackedK; + size_t PackedN; + size_t StrideM; + size_t BufOverRead; +}; + +extern const MLAS_SBGEMM_DISPATCH MlasSBGemmDispatchNeon; + +MLAS_FORCEINLINE +const MLAS_SBGEMM_DISPATCH* +MlasSBGemmGetDispatch() +{ +#if defined(MLAS_TARGET_ARM64) + return &MlasSBGemmDispatchNeon; +#else + std::cerr << "SBGemm Kernel is supported only on ARM64 platform."; + exit(1); +#endif +} + +size_t MLASCALL +MlasSBGemmPackBSize(size_t N, size_t K) +{ + // + // Compute the number of bytes required to hold the packed buffer. + // + const auto* dispatch = MlasSBGemmGetDispatch(); + if (dispatch == nullptr) return 0; + + const auto padding = dispatch->BufOverRead; + const auto PackedK = dispatch->PackedK; + const auto PackedN = dispatch->PackedN; + + const size_t AlignedK = (K + PackedK - 1) & ~(PackedK - 1); + const size_t AlignedN = (N + PackedN - 1) & ~(PackedN - 1); + const size_t BytesRequired = AlignedN * AlignedK * sizeof(bfloat16_t) + padding; + const size_t BufferAlignment = MlasGetPreferredBufferAlignment(); + const size_t AlignedBytesRequired = + (BytesRequired + BufferAlignment - 1) & ~(BufferAlignment - 1); + + return AlignedBytesRequired; +} + +void MLASCALL +MlasSBGemmConvertPackB(size_t N, size_t K, const float* B, size_t ldb, void* PackedB) +{ + const auto* dispatch = MlasSBGemmGetDispatch(); + if (dispatch == nullptr) return; + + dispatch->ConvertPackBRoutine((bfloat16_t*)PackedB, B, ldb, N, K); +} + +void MLASCALL +MlasSBGemmBatch(const size_t M, const size_t N, const size_t K, const size_t BatchN, const MLAS_SBGEMM_DATA_PARAMS* Data, MLAS_THREADPOOL* ThreadPool) +{ + const MLAS_SBGEMM_DISPATCH* dispatch = MlasSBGemmGetDispatch(); + if (dispatch == nullptr) return; + + MLAS_SBGEMM_OPERATION* operation = dispatch->Operation; + + // + // Compute the number of target threads given the complexity of the SBGEMM + // operation. Small requests should run using the single threaded path.
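The comment above describes the sizing heuristic that MlasSBGemmBatch implements just below: the thread count grows with the M*N*K multiply-accumulate volume and is capped by the pool. Restated compactly (the constant mirrors the 64 KiB-per-thread complexity figure defined earlier in mlasi.h):

```cpp
// Compact restatement of the thread-count heuristic in MlasSBGemmBatch.
// MaxThreads is the pool limit; this is a sketch, not the MLAS source.
#include <cstddef>

ptrdiff_t TargetThreads(size_t M, size_t N, size_t K, ptrdiff_t MaxThreads)
{
    constexpr double THREAD_COMPLEXITY = 64.0 * 1024.0;  // work per thread
    const double Complexity = double(M) * double(N) * double(K);
    ptrdiff_t Target;
    if (Complexity < THREAD_COMPLEXITY * double(MaxThreads)) {
        Target = ptrdiff_t(Complexity / THREAD_COMPLEXITY) + 1;
    } else {
        Target = MaxThreads;
    }
    return Target < MaxThreads ? Target : MaxThreads;
}
```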
+ // + + const double Complexity = double(M) * double(N) * double(K); + + ptrdiff_t TargetThreadCount; + + if (Complexity < double(MLAS_SBGEMM_THREAD_COMPLEXITY * GetMlasPlatform().MaximumThreadCount)) { + TargetThreadCount = ptrdiff_t(Complexity / double(MLAS_SGEMM_THREAD_COMPLEXITY)) + 1; + } else { + TargetThreadCount = GetMlasPlatform().MaximumThreadCount; + } + + ptrdiff_t MaximumThreadCount = MlasGetMaximumThreadCount(ThreadPool); + + if (TargetThreadCount >= MaximumThreadCount) { + TargetThreadCount = MaximumThreadCount; + } + + // + // Segment the operation across multiple threads. + // + // N.B. Currently, the operation is segmented as a 1D partition, which + // works okay for operations involving skinny matrices. + // + ptrdiff_t ThreadsPerGemm = (TargetThreadCount + BatchN - 1) / BatchN; + ptrdiff_t ThreadCountM; + ptrdiff_t ThreadCountN; + + if (N > M) { + const size_t BlockedN = + (N + MLAS_SGEMM_STRIDEN_THREAD_ALIGN - 1) / MLAS_SGEMM_STRIDEN_THREAD_ALIGN; + + if (size_t(ThreadsPerGemm) > BlockedN) { + ThreadsPerGemm = ptrdiff_t(BlockedN); + } + + ThreadCountM = 1; + ThreadCountN = ThreadsPerGemm; + + } else { + if (size_t(ThreadsPerGemm) > M) { + ThreadsPerGemm = ptrdiff_t(M); + } + + ThreadCountM = ThreadsPerGemm; + ThreadCountN = 1; + } + + MlasTrySimpleParallel( + ThreadPool, ThreadsPerGemm * static_cast(BatchN), [=](ptrdiff_t tid) { + ptrdiff_t GemmIdx = tid / ThreadsPerGemm; + ptrdiff_t ThreadIdx = tid % ThreadsPerGemm; + operation(ThreadCountM, ThreadCountN, M, N, K, &(Data[GemmIdx]), ThreadIdx); + } + ); +} +#endif // defined(__aarch64__) && defined(__linux__) diff --git a/onnxruntime/core/mlas/lib/sbgemm_kernel_neon.cpp b/onnxruntime/core/mlas/lib/sbgemm_kernel_neon.cpp new file mode 100644 index 000000000000..a6a73996c548 --- /dev/null +++ b/onnxruntime/core/mlas/lib/sbgemm_kernel_neon.cpp @@ -0,0 +1,362 @@ +/*++ + +Copyright (c) Microsoft Corporation. All rights reserved. +Copyright 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved. + +Licensed under the MIT License. + +Module Name: + + sbgemm_kernel_neon.cpp + +Abstract: + + This module implements bfloat16 precision GEMM kernel for neon. + +--*/ + +#if defined(__aarch64__) && defined(__linux__) + +#include "arm_neon.h" +#include "mlasi.h" +#include "sbgemm.h" + +struct MLAS_SBGEMM_KERNEL_NEON { + static constexpr bool PackNeeded = true; + static constexpr size_t KernelMaxM = 8; // max # rows the vectorized kernel can process + static constexpr size_t PackedK = 4; + static constexpr size_t PackedN = MLAS_SGEMM_STRIDEN_THREAD_ALIGN; + static constexpr MLAS_SBGEMM_STRIDES Strides{128, 128, 256}; // M:N:K +}; + +bool MLASCALL +MlasBf16AccelerationSupported() +{ +#if defined(MLAS_TARGET_ARM64) + return MLAS_CPUIDINFO::GetCPUIDInfo().HasArmNeon_BF16(); +#else + return false; +#endif +} + +/* + This routine converts fp32 to bf16 and copies elements from the source + matrix to the destination packed buffer. + + 4x2 elements from the source matrix are unrolled to be physically + contiguous for better locality inside the SBGEMM kernels. The remaining + rows and columns are padded to 4 and 2 alignment. +*/ +MLAS_FORCEINLINE +void +MlasSBGemmConvertCopyPackB(bfloat16_t* D, const float* B, size_t ldb, size_t CountN, size_t CountK) +{ + // + // Copy data from matrix B into the destination buffer 4x2 blocks at a + // time. 
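Before the vector code, it may help to see the packed-B layout it produces as plain C++, per the routine's doc comment above and the loop that follows: columns are handled in strips of up to eight, and within each group of four K rows every column stores its four (zero-padded) K values contiguously, already converted to bf16, which is the operand shape BFMMLA expects. This reference is illustrative; the truncating conversion below approximates the rounding vcvt instructions used by the NEON path.

```cpp
// Scalar reference (illustrative, not the MLAS source) for the packed-B
// layout produced by MlasSBGemmConvertCopyPackB below.
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <vector>

// Truncating float -> bf16 conversion (keeps the high 16 bits).
static uint16_t F32ToBf16Truncate(float f)
{
    uint32_t bits;
    std::memcpy(&bits, &f, sizeof(bits));
    return static_cast<uint16_t>(bits >> 16);
}

std::vector<uint16_t> PackBReference(const float* B, size_t ldb,
                                     size_t CountN, size_t CountK)
{
    const size_t KGroups = (CountK + 3) / 4;  // K padded to multiples of 4
    std::vector<uint16_t> D;
    D.reserve(KGroups * CountN * 4);
    for (size_t n0 = 0; n0 < CountN; n0 += 8) {            // 8-column strips
        const size_t StripN = (CountN - n0 < 8) ? CountN - n0 : 8;
        const size_t PaddedN = (StripN + 1) & ~size_t(1);  // pad to col pairs
        for (size_t g = 0; g < KGroups; ++g) {             // groups of 4 rows
            for (size_t j = 0; j < PaddedN; ++j) {
                for (size_t kk = 0; kk < 4; ++kk) {        // contiguous quad
                    const size_t k = g * 4 + kk;
                    const float v = (j < StripN && k < CountK)
                                        ? B[k * ldb + (n0 + j)]
                                        : 0.0f;
                    D.push_back(F32ToBf16Truncate(v));
                }
            }
        }
    }
    return D;
}
```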
+ // + // + while (CountN >= 8) { + const float* b = B; + int y = static_cast(CountK); + + while (y > 0) { + MLAS_FLOAT32X4 t0_l = MlasZeroFloat32x4(); + MLAS_FLOAT32X4 t0_h = MlasZeroFloat32x4(); + MLAS_FLOAT32X4 t1_l = MlasZeroFloat32x4(); + MLAS_FLOAT32X4 t1_h = MlasZeroFloat32x4(); + MLAS_FLOAT32X4 t2_l = MlasZeroFloat32x4(); + MLAS_FLOAT32X4 t2_h = MlasZeroFloat32x4(); + MLAS_FLOAT32X4 t3_l = MlasZeroFloat32x4(); + MLAS_FLOAT32X4 t3_h = MlasZeroFloat32x4(); + + if (y >= 4) { + t0_l = MlasLoadFloat32x4(&b[ldb * 0]); + t0_h = MlasLoadFloat32x4(&b[ldb * 0 + 4]); + t1_l = MlasLoadFloat32x4(&b[ldb * 1]); + t1_h = MlasLoadFloat32x4(&b[ldb * 1 + 4]); + t2_l = MlasLoadFloat32x4(&b[ldb * 2]); + t2_h = MlasLoadFloat32x4(&b[ldb * 2 + 4]); + t3_l = MlasLoadFloat32x4(&b[ldb * 3]); + t3_h = MlasLoadFloat32x4(&b[ldb * 3 + 4]); + } else { + switch (y) { + case 3: + t0_l = MlasLoadFloat32x4(&b[ldb * 0]); + t0_h = MlasLoadFloat32x4(&b[ldb * 0 + 4]); + t1_l = MlasLoadFloat32x4(&b[ldb * 1]); + t1_h = MlasLoadFloat32x4(&b[ldb * 1 + 4]); + t2_l = MlasLoadFloat32x4(&b[ldb * 2]); + t2_h = MlasLoadFloat32x4(&b[ldb * 2 + 4]); + break; + case 2: + t0_l = MlasLoadFloat32x4(&b[ldb * 0]); + t0_h = MlasLoadFloat32x4(&b[ldb * 0 + 4]); + t1_l = MlasLoadFloat32x4(&b[ldb * 1]); + t1_h = MlasLoadFloat32x4(&b[ldb * 1 + 4]); + break; + case 1: + t0_l = MlasLoadFloat32x4(&b[ldb * 0]); + t0_h = MlasLoadFloat32x4(&b[ldb * 0 + 4]); + break; + } + } + + float32x4x2_t z0_l = vzipq_f32(t0_l, t2_l); + float32x4x2_t z1_l = vzipq_f32(t1_l, t3_l); + float32x4x2_t o0_l = vzipq_f32(z0_l.val[0], z1_l.val[0]); + float32x4x2_t o1_l = vzipq_f32(z0_l.val[1], z1_l.val[1]); + t0_l = o0_l.val[0]; + t1_l = o0_l.val[1]; + t2_l = o1_l.val[0]; + t3_l = o1_l.val[1]; + + bfloat16x8_t t0t1_l_4h = vcvtq_low_bf16_f32(t0_l); + bfloat16x8_t t0t1_l_8h = vcvtq_high_bf16_f32(t0t1_l_4h, t1_l); + + bfloat16x8_t t2t3_l_4h = vcvtq_low_bf16_f32(t2_l); + bfloat16x8_t t2t3_l_8h = vcvtq_high_bf16_f32(t2t3_l_4h, t3_l); + + vst1q_bf16(&D[0], t0t1_l_8h); + vst1q_bf16(&D[8], t2t3_l_8h); + + float32x4x2_t z0_h = vzipq_f32(t0_h, t2_h); + float32x4x2_t z1_h = vzipq_f32(t1_h, t3_h); + float32x4x2_t o0_h = vzipq_f32(z0_h.val[0], z1_h.val[0]); + float32x4x2_t o1_h = vzipq_f32(z0_h.val[1], z1_h.val[1]); + t0_h = o0_h.val[0]; + t1_h = o0_h.val[1]; + t2_h = o1_h.val[0]; + t3_h = o1_h.val[1]; + + bfloat16x8_t t0t1_h_4h = vcvtq_low_bf16_f32(t0_h); + bfloat16x8_t t0t1_h_8h = vcvtq_high_bf16_f32(t0t1_h_4h, t1_h); + + bfloat16x8_t t2t3_h_4h = vcvtq_low_bf16_f32(t2_h); + bfloat16x8_t t2t3_h_8h = vcvtq_high_bf16_f32(t2t3_h_4h, t3_h); + + vst1q_bf16(&D[16], t0t1_h_8h); + vst1q_bf16(&D[24], t2t3_h_8h); + + D += 32; + b += ldb * 4; + y -= 4; + }; + B += 8; + CountN -= 8; + } + + // + // Special case the handling of the remaining columns less than 8 elements + // wide. 
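The vzipq_f32 sequence repeated throughout the loop above is a classic two-stage zip transpose: four row vectors become four column vectors. In scalar form:

```cpp
// Scalar demonstration of the two-stage zip used above to transpose a 4x4
// tile (rows t0..t3 become columns), matching the vzipq_f32 sequence in
// MlasSBGemmConvertCopyPackB. Illustrative only.
#include <array>

using Vec4 = std::array<float, 4>;

// Interleave two 4-element vectors into a low half and a high half.
static std::array<Vec4, 2> Zip(const Vec4& a, const Vec4& b)
{
    return {Vec4{a[0], b[0], a[1], b[1]}, Vec4{a[2], b[2], a[3], b[3]}};
}

std::array<Vec4, 4> Transpose4x4(Vec4 t0, Vec4 t1, Vec4 t2, Vec4 t3)
{
    auto z0 = Zip(t0, t2);  // interleave rows 0 and 2
    auto z1 = Zip(t1, t3);  // interleave rows 1 and 3
    auto o0 = Zip(z0[0], z1[0]);
    auto o1 = Zip(z0[1], z1[1]);
    return {o0[0], o0[1], o1[0], o1[1]};  // each entry is one input column
}
```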
+ // + if (CountN > 0) { + int y = static_cast(CountK); + while (y > 0) { + const float* b = B; + size_t b_inc = 0; + if ((CountN & 4) != 0) { + MLAS_FLOAT32X4 t0 = MlasZeroFloat32x4(); + MLAS_FLOAT32X4 t1 = MlasZeroFloat32x4(); + MLAS_FLOAT32X4 t2 = MlasZeroFloat32x4(); + MLAS_FLOAT32X4 t3 = MlasZeroFloat32x4(); + if (y >= 4) { + t0 = MlasLoadFloat32x4(&b[ldb * 0]); + t1 = MlasLoadFloat32x4(&b[ldb * 1]); + t2 = MlasLoadFloat32x4(&b[ldb * 2]); + t3 = MlasLoadFloat32x4(&b[ldb * 3]); + } else { + switch (y) { + case 3: + t0 = MlasLoadFloat32x4(&b[ldb * 0]); + t1 = MlasLoadFloat32x4(&b[ldb * 1]); + t2 = MlasLoadFloat32x4(&b[ldb * 2]); + break; + case 2: + t0 = MlasLoadFloat32x4(&b[ldb * 0]); + t1 = MlasLoadFloat32x4(&b[ldb * 1]); + break; + case 1: + t0 = MlasLoadFloat32x4(&b[ldb * 0]); + break; + } + } + + float32x4x2_t z0 = vzipq_f32(t0, t2); + float32x4x2_t z1 = vzipq_f32(t1, t3); + float32x4x2_t o0 = vzipq_f32(z0.val[0], z1.val[0]); + float32x4x2_t o1 = vzipq_f32(z0.val[1], z1.val[1]); + + t0 = o0.val[0]; + t1 = o0.val[1]; + t2 = o1.val[0]; + t3 = o1.val[1]; + + bfloat16x8_t t0t1_4h = vcvtq_low_bf16_f32(t0); + bfloat16x8_t t0t1_8h = vcvtq_high_bf16_f32(t0t1_4h, t1); + + bfloat16x8_t t2t3_4h = vcvtq_low_bf16_f32(t2); + bfloat16x8_t t2t3_8h = vcvtq_high_bf16_f32(t2t3_4h, t3); + + vst1q_bf16(&D[0], t0t1_8h); + vst1q_bf16(&D[8], t2t3_8h); + + D += 16; + b += 4; + b_inc += 4; + } + + if ((CountN & 2) != 0) { + float32x2_t t0 = {0x0, 0x0}; + float32x2_t t1 = {0x0, 0x0}; + float32x2_t t2 = {0x0, 0x0}; + float32x2_t t3 = {0x0, 0x0}; + + if (y >= 4) { + t0 = vld1_f32(&b[ldb * 0]); + t1 = vld1_f32(&b[ldb * 1]); + t2 = vld1_f32(&b[ldb * 2]); + t3 = vld1_f32(&b[ldb * 3]); + } else { + switch (y) { + case 3: + t0 = vld1_f32(&b[ldb * 0]); + t1 = vld1_f32(&b[ldb * 1]); + t2 = vld1_f32(&b[ldb * 2]); + break; + case 2: + t0 = vld1_f32(&b[ldb * 0]); + t1 = vld1_f32(&b[ldb * 1]); + break; + case 1: + t0 = vld1_f32(&b[ldb * 0]); + break; + } + } + + float32x2x2_t z0 = vzip_f32(t0, t2); + float32x2x2_t z1 = vzip_f32(t1, t3); + float32x2x2_t o0 = vzip_f32(z0.val[0], z1.val[0]); + float32x2x2_t o1 = vzip_f32(z0.val[1], z1.val[1]); + + float32x4_t tt0 = vcombine_f32(o0.val[0], o0.val[1]); + float32x4_t tt1 = vcombine_f32(o1.val[0], o1.val[1]); + + bfloat16x8_t t_4h = vcvtq_low_bf16_f32(tt0); + bfloat16x8_t t_8h = vcvtq_high_bf16_f32(t_4h, tt1); + + vst1q_bf16(&D[0], t_8h); + + D += 8; + b += 2; + b_inc += 2; + } + if ((CountN & 1) != 0) { + float a = 0.0f; + float b = 0.0f; + float c = 0.0f; + float d = 0.0f; + + if (y >= 4) { + a = *(float*)(&B[ldb * 0 + b_inc]); + b = *(float*)(&B[ldb * 1 + b_inc]); + c = *(float*)(&B[ldb * 2 + b_inc]); + d = *(float*)(&B[ldb * 3 + b_inc]); + } else { + switch (y) { + case 3: + a = *(float*)(&B[ldb * 0 + b_inc]); + b = *(float*)(&B[ldb * 1 + b_inc]); + c = *(float*)(&B[ldb * 2 + b_inc]); + break; + case 2: + a = *(float*)(&B[ldb * 0 + b_inc]); + b = *(float*)(&B[ldb * 1 + b_inc]); + break; + case 1: + a = *(float*)(&B[ldb * 0 + b_inc]); + break; + } + } + + float32x2_t t0 = {a, 0x0}; + float32x2_t t1 = {b, 0x0}; + float32x2_t t2 = {c, 0x0}; + float32x2_t t3 = {d, 0x0}; + + float32x2x2_t z0 = vzip_f32(t0, t2); + float32x2x2_t z1 = vzip_f32(t1, t3); + float32x2x2_t o0 = vzip_f32(z0.val[0], z1.val[0]); + float32x2x2_t o1 = vzip_f32(z0.val[1], z1.val[1]); + + float32x4_t tt0 = vcombine_f32(o0.val[0], o0.val[1]); + float32x4_t tt1 = vcombine_f32(o1.val[0], o1.val[1]); + + bfloat16x8_t t_4h = vcvtq_low_bf16_f32(tt0); + bfloat16x8_t t_8h = vcvtq_high_bf16_f32(t_4h, tt1); + + 
                vst1q_bf16(&D[0], t_8h);
+
+                D += 8;
+                b += 1;
+                b_inc += 1;
+            }
+            B += 4 * ldb;
+            y -= 4;
+        }
+    }
+}
+
+template <typename KernelType>
+void
+MlasSBGemmConvertPackB(
+    bfloat16_t* PackedB, const float* B, size_t ldb, size_t CountN, size_t CountK
+)
+{
+    const auto* dispatch = MlasSBGemmGetDispatch();
+    if (dispatch == nullptr) return;
+
+    const auto PackedN = dispatch->PackedN;
+
+    const size_t AlignedN = (CountN + PackedN - 1) & ~(PackedN - 1);
+
+    //
+    // Step through each slice of matrix B along the K dimension.
+    //
+    size_t K_block_size;
+    constexpr MLAS_SBGEMM_STRIDES Strides = KernelType::Strides;
+
+    for (size_t k = 0; k < CountK; k += K_block_size) {
+        K_block_size = std::min(CountK - k, Strides.K);
+
+        MlasSBGemmConvertCopyPackB((bfloat16_t*)PackedB, B + k * ldb, ldb, CountN, K_block_size);
+        PackedB = (bfloat16_t*)PackedB + AlignedN * K_block_size;
+    }
+}
+
+template <>
+MLAS_FORCEINLINE void
+MlasSBGemmKernel<MLAS_SBGEMM_KERNEL_NEON>(size_t CountM, size_t CountN, size_t CountK, const float* A, size_t lda, const bfloat16_t* B, float* C, size_t ldc, const float* Bias, const bool ZeroMode)
+{
+    while (CountM > 0) {
+        size_t RowsHandled;
+        if (ZeroMode) {
+            RowsHandled = MlasSbgemmKernelZero(A, B, C, CountK, CountM, CountN, lda, ldc, Bias);
+        } else {
+            RowsHandled = MlasSbgemmKernelAdd(A, B, C, CountK, CountM, CountN, lda, ldc, Bias);
+        }
+        C += ldc * RowsHandled;
+        A += lda * RowsHandled;
+        CountM -= RowsHandled;
+    }
+}
+
+const MLAS_SBGEMM_DISPATCH MlasSBGemmDispatchNeon = {
+    MlasSBGemmOperation<MLAS_SBGEMM_KERNEL_NEON>,
+    MlasSBGemmConvertPackB<MLAS_SBGEMM_KERNEL_NEON>,
+    MLAS_SBGEMM_KERNEL_NEON::PackedK,
+    MLAS_SBGEMM_KERNEL_NEON::PackedN,
+    MLAS_SBGEMM_KERNEL_NEON::KernelMaxM,
+    32  // kernel may read beyond buffer end by 32 bytes
+};
+#endif  // defined(__aarch64__) && defined(__linux__)
diff --git a/onnxruntime/core/mlas/lib/sqnbitgemm.cpp b/onnxruntime/core/mlas/lib/sqnbitgemm.cpp
index 7f1d1b084aec..38c31c884176 100644
--- a/onnxruntime/core/mlas/lib/sqnbitgemm.cpp
+++ b/onnxruntime/core/mlas/lib/sqnbitgemm.cpp
@@ -11,38 +11,535 @@ Module Name:
 
 Abstract:
 
     This module implements the float/quantized n-bit integer matrix
-    multiplication hardware agnostic entrypoint, MlasSQNBitGemmBatch.
+    multiplication hardware agnostic entrypoint, MlasSQNBitGemmBatch,
+    as well as some SQNBitGemm-related query functions.
 
 --*/
 
 #include "sqnbitgemm.h"
-#ifdef MLAS_JBLAS
-#include "jblas_gemm.h"
-#endif
+
+#include <cassert>
+
+namespace
+{
+
+enum SQNBitGemmVariant {
+    SQNBitGemmVariantInvalid = -1,
+
+    // Valid variants
+
+    SQNBitGemmVariant_BitWidth4_CompFp32 = 0,
+    SQNBitGemmVariant_BitWidth4_CompInt8,
+
+    // End of valid variants
+
+    // Keep this element last and ensure that its value is the number of valid SQNBitGemmVariant values.
+    // Its value is used as an array size.
+ SQNBitGemmVariantCount, +}; + +SQNBitGemmVariant +GetSQNBitGemmVariant( + size_t BlkBitWidth, + size_t BlkLen, + MLAS_SQNBIT_GEMM_COMPUTE_TYPE ComputeType +) +{ + if (BlkBitWidth == 4 && + (BlkLen == 16 || BlkLen == 32 || BlkLen == 64 || BlkLen == 128 || BlkLen == 256)) { + if (ComputeType == CompFp32 || + ComputeType == CompUndef) { // treat CompUndef (undefined) as CompFp32 + return SQNBitGemmVariant_BitWidth4_CompFp32; + } else if (ComputeType == CompInt8) { + return SQNBitGemmVariant_BitWidth4_CompInt8; + } + } + + return SQNBitGemmVariantInvalid; +} + +} // namespace + +bool MLASCALL +MlasIsSQNBitGemmAvailable( + size_t BlkBitWidth, + size_t BlkLen, + MLAS_SQNBIT_GEMM_COMPUTE_TYPE ComputeType +) +{ + const auto* Dispatch = GetMlasPlatform().SQNBitGemmDispatch; + if (Dispatch == nullptr) { + return false; + } + + const auto Variant = GetSQNBitGemmVariant(BlkBitWidth, BlkLen, ComputeType); + + switch (Variant) { + case SQNBitGemmVariant_BitWidth4_CompFp32: { + return Dispatch->SQ4BitGemmM1Kernel_CompFp32 != nullptr && + Dispatch->Q4BitBlkDequantBForSgemm_CompFp32 != nullptr; + } + case SQNBitGemmVariant_BitWidth4_CompInt8: { + return Dispatch->SQ4BitGemmM1Kernel_CompInt8 != nullptr && + Dispatch->QuantizeARow_CompInt8 != nullptr; + } + default: { + return false; + } + } +} + +namespace +{ + +size_t +SQNBitGemmWorkspaceAlignment(SQNBitGemmVariant Variant) +{ + switch (Variant) { + case SQNBitGemmVariant_BitWidth4_CompInt8: { + return Q8BlkAlignment(); + } + default: { + return 1; + } + } +} + +size_t +SQNBitGemmPerGemmWorkspaceSize( + SQNBitGemmVariant Variant, + size_t M, + size_t N, + size_t K, + size_t BlkLen +) +{ + MLAS_UNREFERENCED_PARAMETER(N); + + switch (Variant) { + case SQNBitGemmVariant_BitWidth4_CompInt8: { + // workspace buffer is used for block quantization of A to int8 + const size_t BlockCountK = MlasDivRoundup(K, BlkLen); + const size_t PerGemmWorkspaceSize = M * BlockCountK * Q8BlkSize(BlkLen); + return PerGemmWorkspaceSize; + } + default: { + return 0; + } + } +} + +size_t +SQNBitGemmPerGemmWorkspaceStride( + SQNBitGemmVariant Variant, + size_t M, + size_t N, + size_t K, + size_t BlkLen +) +{ + const auto Size = SQNBitGemmPerGemmWorkspaceSize(Variant, M, N, K, BlkLen); + const auto Alignment = SQNBitGemmWorkspaceAlignment(Variant); + return MlasDivRoundup(Size, Alignment) * Alignment; +} + +} // namespace + +size_t MLASCALL +MlasSQNBitGemmBatchWorkspaceSize( + size_t M, + size_t N, + size_t K, + size_t BatchN, + size_t BlkBitWidth, + size_t BlkLen, + MLAS_SQNBIT_GEMM_COMPUTE_TYPE ComputeType +) +{ + const auto Variant = GetSQNBitGemmVariant(BlkBitWidth, BlkLen, ComputeType); + + const size_t PerGemmWorkspaceStride = SQNBitGemmPerGemmWorkspaceStride(Variant, M, N, K, BlkLen); + if (PerGemmWorkspaceStride == 0) { + return 0; + } + + const size_t Alignment = SQNBitGemmWorkspaceAlignment(Variant); + + const size_t WorkspaceSize = BatchN * PerGemmWorkspaceStride; + + return WorkspaceSize + Alignment - 1; +} + +size_t MLASCALL +MlasSQNBitGemmPackQuantBDataSize( + size_t N, + size_t K, + size_t BlkBitWidth, + size_t BlkLen, + MLAS_SQNBIT_GEMM_COMPUTE_TYPE ComputeType +) +{ + const auto* Dispatch = GetMlasPlatform().SQNBitGemmDispatch; + if (Dispatch == nullptr) { + return 0; + } + + if (BlkBitWidth == 4 && Dispatch->SQ4BitGemmPackQuantBDataSize != nullptr) { + return Dispatch->SQ4BitGemmPackQuantBDataSize( + N, K, BlkLen, ComputeType + ); + } + + return 0; +} + +void MLASCALL +MlasSQNBitGemmPackQuantBData( + size_t N, + size_t K, + size_t BlkBitWidth, + size_t BlkLen, + 
MLAS_SQNBIT_GEMM_COMPUTE_TYPE ComputeType, + const void* QuantBData, + void* PackedQuantBData, + MLAS_THREADPOOL* ThreadPool +) +{ + const auto* Dispatch = GetMlasPlatform().SQNBitGemmDispatch; + if (Dispatch == nullptr) { + return; + } + + if (BlkBitWidth == 4 && Dispatch->SQ4BitGemmPackQuantBData != nullptr) { + Dispatch->SQ4BitGemmPackQuantBData( + N, + K, + BlkLen, + ComputeType, + static_cast(QuantBData), + static_cast(PackedQuantBData), + ThreadPool + ); + return; + } +} namespace { -// Get quantization variant based on `BlkBitWidth` and `BlkLen`. -// Return -1 if the input values are unsupported. -int32_t -GetDispatchQuantVariant(size_t BlkBitWidth, size_t BlkLen) +MLAS_FORCEINLINE void +AddBiasForGemm(const float* Bias, float* C, size_t CountM, size_t CountN, size_t ldc) +{ + for (size_t m = 0; m < CountM; m++) { + const float* bias = Bias; + float* sum = C; + for (size_t n = 0; n < CountN; n += 4) { + if (CountN - n < 4) { + for (size_t nn = n; nn < CountN; nn++) { + *sum += *bias; + sum++; + bias++; + } + break; + } + + MLAS_FLOAT32X4 acc_x = MlasLoadFloat32x4(sum); + acc_x = MlasAddFloat32x4(acc_x, MlasLoadFloat32x4(bias)); + MlasStoreFloat32x4(sum, acc_x); + bias += 4; + sum += 4; + } + C += ldc; + } +} + +typedef void(SQNBitGemmFn)( + size_t BlkLen, + size_t K, + const MLAS_SQNBIT_GEMM_DATA_PARAMS* DataParams, + void* PerGemmWorkspace, + size_t RangeStartM, + size_t RangeCountM, + size_t RangeStartN, + size_t RangeCountN +); + +void +SQ4BitGemm_CompFp32( + const size_t BlkLen, + const size_t K, + const MLAS_SQNBIT_GEMM_DATA_PARAMS* const DataParams, + void* const PerGemmWorkspace, + const size_t RangeStartM, + const size_t RangeCountM, + const size_t RangeStartN, + const size_t RangeCountN +) +{ + constexpr size_t BlkBitWidth = 4; + + MLAS_UNREFERENCED_PARAMETER(PerGemmWorkspace); + + const size_t lda = DataParams->lda; + const size_t ldc = DataParams->ldc; + + const size_t k_blks = MlasDivRoundup(K, BlkLen); + const size_t ldb = k_blks * MlasQNBitBlkDataSizeInBytes(BlkBitWidth, BlkLen); + const size_t k_blks_zp_bytes = MlasQNBitZeroPointsForBlksSizeInBytes(k_blks); + + const float* A = DataParams->A + RangeStartM * lda; + + const std::byte* QuantBData = static_cast(DataParams->QuantBData) + RangeStartN * ldb; + const float* QuantBScale = DataParams->QuantBScale + RangeStartN * k_blks; + const std::byte* QuantBZeroPoint = + (DataParams->QuantBZeroPoint == nullptr) + ? nullptr + : static_cast(DataParams->QuantBZeroPoint) + RangeStartN * k_blks_zp_bytes; + + float* C = DataParams->C + RangeStartM * ldc + RangeStartN; + + const float* Bias = (DataParams->Bias == nullptr) ? nullptr : DataParams->Bias + RangeStartN; + + if (RangeCountM == 1) { + size_t CountN; + for (size_t n = 0; n < RangeCountN; n += CountN) { + CountN = std::min(RangeCountN - n, size_t{128}); + + const float* a_row = A; + const std::byte* b_col = QuantBData + n * ldb; + const float* b_col_scale = QuantBScale + n * k_blks; + const std::byte* b_col_zp = + (QuantBZeroPoint == nullptr) ? nullptr : QuantBZeroPoint + n * k_blks_zp_bytes; + float* c_blk = C + n; + const float* bias = (Bias == nullptr) ? 
nullptr : Bias + n; + + GetMlasPlatform().SQNBitGemmDispatch->SQ4BitGemmM1Kernel_CompFp32( + BlkLen, + a_row, b_col, b_col_scale, b_col_zp, c_blk, CountN, K, k_blks, bias + ); + + if (DataParams->PostProcessor != nullptr) { + DataParams->PostProcessor->Process( + DataParams->C, RangeStartM, RangeStartN + n, + RangeCountM, CountN, ldc + ); + } + } + return; + } + + constexpr size_t StrideN = 32; + size_t bufsize = k_blks * BlkLen * StrideN * sizeof(float); + MlasThreadedBufAlloc(bufsize); + auto* dequant_b = reinterpret_cast(ThreadedBufHolder.get()); + + // + // Step through each slice of matrix B along the N dimension. + // + size_t CountN; + for (size_t n = 0; n < RangeCountN; n += CountN) { + CountN = std::min(RangeCountN - n, StrideN); + + // + // Step through each slice of matrix A along the M dimension. + // + const float* a_row = A; + const std::byte* b_col = QuantBData + n * ldb; + const float* b_col_scale = QuantBScale + n * k_blks; + const std::byte* b_col_zp = + (QuantBZeroPoint == nullptr) ? nullptr : QuantBZeroPoint + n * k_blks_zp_bytes; + float* c_blk = C + n; + const float* bias = (Bias == nullptr) ? nullptr : Bias + n; + + GetMlasPlatform().SQNBitGemmDispatch->Q4BitBlkDequantBForSgemm_CompFp32( + BlkLen, + dequant_b, b_col, b_col_scale, b_col_zp, CountN, K, k_blks + ); + + size_t RowsRemaining = RangeCountM; + while (RowsRemaining > 0) { +#if defined(MLAS_TARGET_AMD64_IX86) || defined(MLAS_TARGET_POWER) || defined(MLAS_TARGET_LARCH64) + auto RowsHandled = GetMlasPlatform().GemmFloatKernel( + a_row, dequant_b, c_blk, K, RowsRemaining, CountN, lda, ldc, 1.f, true + ); +#else + auto RowsHandled = MlasSgemmKernelZero(a_row, dequant_b, c_blk, K, RowsRemaining, CountN, lda, ldc, 1.f); +#endif + + if (bias) { + AddBiasForGemm(bias, c_blk, RowsHandled, CountN, ldc); + } + if (DataParams->PostProcessor != nullptr) { + DataParams->PostProcessor->Process( + DataParams->C, RangeStartM + RangeCountM - RowsRemaining, RangeStartN, + RowsHandled, CountN, ldc + ); + } + + c_blk += ldc * RowsHandled; + a_row += lda * RowsHandled; + RowsRemaining -= RowsHandled; + } + } +} + +void +SQ4BitGemm_CompInt8( + const size_t BlkLen, + const size_t K, + const MLAS_SQNBIT_GEMM_DATA_PARAMS* const DataParams, + void* const PerGemmWorkspace, + const size_t RangeStartM, + const size_t RangeCountM, + const size_t RangeStartN, + const size_t RangeCountN +) { - int32_t type = -1; - if (BlkBitWidth == 4 && BlkLen == 16) { - type = QuantVariant_BitWidth4_BlockSize16; - } else if (BlkBitWidth == 4 && BlkLen == 32) { - type = QuantVariant_BitWidth4_BlockSize32; - } else if (BlkBitWidth == 4 && BlkLen == 64) { - type = QuantVariant_BitWidth4_BlockSize64; - } else if (BlkBitWidth == 4 && BlkLen == 128) { - type = QuantVariant_BitWidth4_BlockSize128; - } else if (BlkBitWidth == 4 && BlkLen == 256) { - type = QuantVariant_BitWidth4_BlockSize256; + constexpr size_t BlkBitWidth = 4; + + const size_t k_blks = MlasDivRoundup(K, BlkLen); + + const size_t lda = k_blks * Q8BlkSize(BlkLen); + const size_t ldc = DataParams->ldc; + const size_t ldb = k_blks * MlasQNBitBlkDataSizeInBytes(BlkBitWidth, BlkLen); + const size_t k_blks_zp_bytes = MlasQNBitZeroPointsForBlksSizeInBytes(k_blks); + + const std::byte* QuantA = static_cast(PerGemmWorkspace) + RangeStartM * lda; + + const std::byte* QuantBData = static_cast(DataParams->QuantBData) + RangeStartN * ldb; + const float* QuantBScale = DataParams->QuantBScale + RangeStartN * k_blks; + const std::byte* QuantBZeroPoint = + (DataParams->QuantBZeroPoint == nullptr) + ? 
nullptr + : static_cast(DataParams->QuantBZeroPoint) + RangeStartN * k_blks_zp_bytes; + + float* C = DataParams->C + RangeStartM * ldc + RangeStartN; + + const float* Bias = (DataParams->Bias == nullptr) ? nullptr : DataParams->Bias + RangeStartN; + + if (RangeCountM == 1) { + size_t CountN; + for (size_t n = 0; n < RangeCountN; n += CountN) { + CountN = std::min(RangeCountN - n, size_t{128}); + + const std::byte* a_row = QuantA; + const std::byte* b_col = QuantBData + n * ldb; + const float* b_col_scale = QuantBScale + n * k_blks; + const std::byte* b_col_zp = + (QuantBZeroPoint == nullptr) ? nullptr : QuantBZeroPoint + n * k_blks_zp_bytes; + float* c_blk = C + n; + const float* bias = (Bias == nullptr) ? nullptr : Bias + n; + + GetMlasPlatform().SQNBitGemmDispatch->SQ4BitGemmM1Kernel_CompInt8( + BlkLen, + a_row, b_col, b_col_scale, b_col_zp, c_blk, CountN, K, k_blks, bias + ); + + if (DataParams->PostProcessor != nullptr) { + DataParams->PostProcessor->Process( + DataParams->C, RangeStartM, RangeStartN + n, + RangeCountM, CountN, ldc + ); + } + } + return; } - return type; + // This is a naive M > 1 implementation that repeatedly calls the M=1 kernel. + // TODO Replace it with an optimized implementation. + size_t CountN; + for (size_t n = 0; n < RangeCountN; n += CountN) { + CountN = std::min(RangeCountN - n, size_t{128}); + + const std::byte* a_row = QuantA; + const std::byte* b_col = QuantBData + n * ldb; + const float* b_col_scale = QuantBScale + n * k_blks; + const std::byte* b_col_zp = + (QuantBZeroPoint == nullptr) ? nullptr : QuantBZeroPoint + n * k_blks_zp_bytes; + float* c_blk = C + n; + const float* bias = (Bias == nullptr) ? nullptr : Bias + n; + + for (size_t m = 0; m < RangeCountM; ++m) { + GetMlasPlatform().SQNBitGemmDispatch->SQ4BitGemmM1Kernel_CompInt8( + BlkLen, + a_row, b_col, b_col_scale, b_col_zp, c_blk, CountN, K, k_blks, bias + ); + + if (DataParams->PostProcessor != nullptr) { + DataParams->PostProcessor->Process( + DataParams->C, RangeStartM, RangeStartN + n, + RangeCountM, CountN, ldc + ); + } + + c_blk += ldc; + a_row += lda; + } + } } +typedef void(InitializeWorkspaceFn)( + size_t M, + size_t N, + size_t K, + size_t BatchN, + size_t BlkLen, + const MLAS_SQNBIT_GEMM_DATA_PARAMS* DataParams, + void* Workspace, + size_t PerGemmWorkspaceStride, + MLAS_THREADPOOL* ThreadPool +); + +void +InitializeWorkspace_CompInt8( + size_t M, + size_t N, + size_t K, + size_t BatchN, + size_t BlkLen, + const MLAS_SQNBIT_GEMM_DATA_PARAMS* DataParams, + void* Workspace, + size_t PerGemmWorkspaceStride, + MLAS_THREADPOOL* ThreadPool +) +{ + MLAS_UNREFERENCED_PARAMETER(N); + + const auto QuantizeARow = GetMlasPlatform().SQNBitGemmDispatch->QuantizeARow_CompInt8; + + const size_t BlockCountK = MlasDivRoundup(K, BlkLen); + const size_t QuantAStride = BlockCountK * Q8BlkSize(BlkLen); + + MlasTrySimpleParallel(ThreadPool, BatchN, [&](ptrdiff_t gemm_idx) { + const auto& data = DataParams[gemm_idx]; + + const float* ARowPtr = data.A; + std::byte* QuantARowPtr = static_cast(Workspace) + gemm_idx * PerGemmWorkspaceStride; + + for (size_t m = 0; m < M; ++m) { + QuantizeARow(BlkLen, ARowPtr, K, QuantARowPtr); + + ARowPtr += data.lda; + QuantARowPtr += QuantAStride; + } + }); +} + +struct Operations { + InitializeWorkspaceFn* InitializeWorkspace = nullptr; + SQNBitGemmFn* SQNBitGemm = nullptr; +}; + +constexpr auto OperationMap = []() { + std::array ops; + + ops[SQNBitGemmVariant_BitWidth4_CompFp32].SQNBitGemm = SQ4BitGemm_CompFp32; + + 
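    // Only the CompInt8 variant needs a workspace initializer: its per-GEMM
    // workspace receives the block-quantized int8 copy of A produced by
    // QuantizeARow_CompInt8. The CompFp32 path reads A directly, so its
    // InitializeWorkspace member intentionally stays nullptr.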
ops[SQNBitGemmVariant_BitWidth4_CompInt8].InitializeWorkspace = InitializeWorkspace_CompInt8; + ops[SQNBitGemmVariant_BitWidth4_CompInt8].SQNBitGemm = SQ4BitGemm_CompInt8; + + return ops; +}(); + } // namespace void MLASCALL @@ -53,17 +550,43 @@ MlasSQNBitGemmBatch( const size_t BatchN, const size_t BlkBitWidth, const size_t BlkLen, + MLAS_SQNBIT_GEMM_COMPUTE_TYPE ComputeType, const MLAS_SQNBIT_GEMM_DATA_PARAMS* DataParams, + void* Workspace, MLAS_THREADPOOL* ThreadPool ) { - const int32_t QuantVariant = GetDispatchQuantVariant(BlkBitWidth, BlkLen); - MLAS_SQNBIT_GEMM_OPERATION* const Operation = GetMlasPlatform().SQNBitGemmDispatch->Operations[QuantVariant]; + const auto Variant = GetSQNBitGemmVariant(BlkBitWidth, BlkLen, ComputeType); + assert(Variant != SQNBitGemmVariantInvalid); + + // + // Ensure `Workspace` has correct alignment. + // + if (Workspace != nullptr) { + const size_t Alignment = SQNBitGemmWorkspaceAlignment(Variant); + const uintptr_t WorkspaceAddress = reinterpret_cast(Workspace); + Workspace = reinterpret_cast( + (WorkspaceAddress + Alignment - 1) & (~(Alignment - 1)) + ); + } + + const size_t PerGemmWorkspaceStride = SQNBitGemmPerGemmWorkspaceStride(Variant, M, N, K, BlkLen); + + if (const auto InitializeWorkspaceOperation = OperationMap[Variant].InitializeWorkspace; + InitializeWorkspaceOperation != nullptr) { + InitializeWorkspaceOperation( + M, N, K, BatchN, BlkLen, DataParams, Workspace, PerGemmWorkspaceStride, ThreadPool + ); + } + + const auto ComputeOperation = OperationMap[Variant].SQNBitGemm; if (ThreadPool == nullptr) { for (size_t gemm_i = 0; gemm_i < BatchN; gemm_i++) { - auto Data = &DataParams[gemm_i]; - Operation(K, Data, 0, M, 0, N); + const auto* Data = &DataParams[gemm_i]; + void* PerGemmWorkspace = + reinterpret_cast(Workspace) + gemm_i * PerGemmWorkspaceStride; + ComputeOperation(BlkLen, K, Data, PerGemmWorkspace, 0, M, 0, N); } return; } @@ -112,7 +635,10 @@ MlasSQNBitGemmBatch( MlasTrySimpleParallel(ThreadPool, ThreadsPerGemm * BatchN, [&](ptrdiff_t tid) { const auto gemm_i = tid / ThreadsPerGemm; const auto blk_i = tid % ThreadsPerGemm; - auto Data = &DataParams[gemm_i]; + const auto* Data = &DataParams[gemm_i]; + void* PerGemmWorkspace = reinterpret_cast( + reinterpret_cast(Workspace) + gemm_i * PerGemmWorkspaceStride + ); const ptrdiff_t ThreadIdN = blk_i / ThreadCountM; const ptrdiff_t ThreadIdM = blk_i % ThreadCountM; @@ -123,149 +649,6 @@ MlasSQNBitGemmBatch( const size_t RangeStartN = ThreadIdN * StrideN; const size_t RangeCountN = std::min(N - RangeStartN, (size_t)StrideN); - Operation(K, Data, RangeStartM, RangeCountM, RangeStartN, RangeCountN); + ComputeOperation(BlkLen, K, Data, PerGemmWorkspace, RangeStartM, RangeCountM, RangeStartN, RangeCountN); }); } - -bool MLASCALL -MlasIsSQNBitGemmAvailable( - size_t BlkBitWidth, - size_t BlkLen -) -{ - const int32_t QuantVariant = GetDispatchQuantVariant(BlkBitWidth, BlkLen); - if (QuantVariant == -1) { - return false; - } - - if (GetMlasPlatform().SQNBitGemmDispatch == nullptr || - GetMlasPlatform().SQNBitGemmDispatch->Operations[QuantVariant] == nullptr) { - return false; - } - - return true; -} - -size_t MLASCALL -MlasNBitsGemmPackBSize( - size_t N, size_t K, size_t BlkSize, int nbits, bool isAsym, MLAS_SQNBIT_COMPUTE_TYPE CompType -) -{ -#ifdef MLAS_JBLAS - if (nbits == 4) { - auto jsize = JblasQ4GemmPackBSize(N, K, BlkSize, isAsym, CompType); - if (jsize) { - return jsize; - } - } -#endif - (void)(N); - (void)(K); - (void)(BlkSize); - (void)(nbits); - (void)(isAsym); - (void)(CompType); - 
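For orientation, the reworked MlasSQNBitGemmBatch composes with the query functions earlier in this file. A minimal sketch of a single-GEMM call, assuming 4-bit blocks of length 32 and the CompInt8 path; RunSQ4BitGemm, params, and tp are placeholder names, not part of the patch:

#include <cstddef>
#include <vector>

#include "mlas_qnbit.h"  // public SQNBitGemm declarations

// Placeholder driver (illustrative): one M x N x K GEMM, BlkBitWidth 4,
// BlkLen 32, CompInt8. DataParams setup is omitted.
void RunSQ4BitGemm(size_t M, size_t N, size_t K,
                   const MLAS_SQNBIT_GEMM_DATA_PARAMS* params,
                   MLAS_THREADPOOL* tp)
{
    constexpr size_t BlkBitWidth = 4;
    constexpr size_t BlkLen = 32;

    if (!MlasIsSQNBitGemmAvailable(BlkBitWidth, BlkLen, CompInt8)) {
        return;  // caller would fall back to another compute type
    }

    // The returned size already includes slack for the alignment fix-up that
    // MlasSQNBitGemmBatch performs on the pointer it is given.
    const size_t WorkspaceSize =
        MlasSQNBitGemmBatchWorkspaceSize(M, N, K, /*BatchN=*/1, BlkBitWidth, BlkLen, CompInt8);
    std::vector<std::byte> Workspace(WorkspaceSize);

    MlasSQNBitGemmBatch(M, N, K, /*BatchN=*/1, BlkBitWidth, BlkLen, CompInt8, params,
                        WorkspaceSize > 0 ? Workspace.data() : nullptr, tp);
}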
return 0; -} - -void MLASCALL -MlasNBitsGemmPackB( - void* PackedBuf, - const uint8_t* QData, - const float* Scale, - const uint8_t* Zp, - size_t N, - size_t K, - size_t ldb, - size_t BlkSize, - int nbits, - bool isAsym, - bool lastCall, - MLAS_SQNBIT_COMPUTE_TYPE CompType, - MLAS_THREADPOOL* ThreadPool -) -{ -#ifdef MLAS_JBLAS - if (nbits == 4) { - if (JblasQ4GemmPackB(PackedBuf, QData, Scale, Zp, N, K, ldb, BlkSize, isAsym, lastCall, CompType, ThreadPool)) { - return; - } - } -#endif - (void)(PackedBuf); - (void)(QData); - (void)(Scale); - (void)(Zp); - (void)(N); - (void)(K); - (void)(ldb); - (void)(BlkSize); - (void)(nbits); - (void)(isAsym); - (void)(lastCall); - (void)(CompType); - (void)(ThreadPool); -} - -void MLASCALL -MlasNBitsGemmUnPackB(float* FpData, const void* PackedBuf, size_t N, size_t K, size_t ldb, MLAS_THREADPOOL* ThreadPool) -{ -#ifdef MLAS_JBLAS - if (JblasQ4GemmUnPackB(FpData, PackedBuf, N, K, ldb, ThreadPool)) { - return; - } -#endif - (void)(FpData); - (void)(PackedBuf); - (void)(N); - (void)(K); - (void)(ldb); - (void)(ThreadPool); -} - -size_t MLASCALL -MlasSQNBitsGemmBatchWorkspaceSize( - const size_t M, - const size_t N, - const size_t K, - const size_t BatchN, - const MLAS_SQNBITS_GEMM_DATA_PACKED_PARAMS* DataParams -) -{ -#ifdef MLAS_JBLAS - return JblasSQ4GemmBatchWorkspaceSize(M, N, K, BatchN, DataParams); -#endif - (void)(M); - (void)(N); - (void)(K); - (void)(BatchN); - (void)(DataParams); - return 0; -} - -void MLASCALL -MlasSQNBitsGemmBatchPackedB( - const size_t M, - const size_t N, - const size_t K, - const size_t BatchN, - const MLAS_SQNBITS_GEMM_DATA_PACKED_PARAMS* DataParams, - void* WorkSpace, - MLAS_THREADPOOL* ThreadPool -) -{ - GetMlasPlatform(); -#ifdef MLAS_JBLAS - if (JblasSQ4GemmBatchDriver(M, N, K, BatchN, DataParams, reinterpret_cast(WorkSpace), ThreadPool)) { - // PackedWeight is created by jblas - return; - } -#endif - (void)(M); - (void)(N); - (void)(K); - (void)(BatchN); - (void)(DataParams); - (void)(WorkSpace); - (void)(ThreadPool); -} diff --git a/onnxruntime/core/mlas/lib/sqnbitgemm.h b/onnxruntime/core/mlas/lib/sqnbitgemm.h index f8f7dcd43699..3992bc3e452a 100644 --- a/onnxruntime/core/mlas/lib/sqnbitgemm.h +++ b/onnxruntime/core/mlas/lib/sqnbitgemm.h @@ -10,98 +10,23 @@ Module Name: Abstract: - This module includes: + This module includes kernel function prototypes and helper functions for + implementing SQNBitGemm. - - Declaration of the set of template functions used to implement a kernel - for a matrix/matrix multiplication, A*B, where A is a float matrix and B is - a n-bit quantized integer matrix (QNBitGemm). - - - A shared kernel driver function template, MlasSQNBitGemmOperation. - - - Kernel dispatch structure. - - The B matrix is block quantized, which means that its values are grouped - into blocks which each have one scale and optional zero point. Each - quantized value in B is n-bits wide. + SQNBitGemm is a matrix/matrix multiplication, A*B, where A is a float + matrix and B is a n-bit quantized integer matrix. B is block quantized, + meaning values of B are divided into blocks and each block has its own + scale and optional zero point. --*/ #pragma once +#include + #include "mlas_qnbit.h" #include "mlasi.h" -// -// Kernel implementation template declarations -// - -/** - * @brief Multiply float matrix A with quantized n-bit integer matrix B. - * B is block quantized and column major. - * This kernel handles the special case where M, the number of rows of A and C, is 1. 
- * - * @tparam BlkBitWidth Bit width of each value in a block. - * @tparam BlkLen Number of values in a block. - * @tparam KernelType Hardware-specific kernel type. - * - * @param A Supplies the A matrix. - * @param QuantBData Supplies the quantized B matrix block data. - * @param QuantBScale Supplies the quantized B matrix block scale values. - * @param QuantBZeroPoint Supplies the quantized B matrix block zero point values. Optional. - * @param[out] C Supplies the output C matrix. - * @param CountN Number of columns of B and C. - * @param CountK Number of columns of A and rows of B. - * @param BlockStrideQuantB Number of blocks between adjacent columns of the quantized B matrix. - * @param Bias Bias vector of length N. - */ -template -MLAS_FORCEINLINE void -MlasSQNBitGemmM1Kernel( - const float* A, - const uint8_t* QuantBData, - const float* QuantBScale, - const uint8_t* QuantBZeroPoint, - float* C, - size_t CountN, - size_t CountK, - size_t BlockStrideQuantB, - const float* Bias -); - -/** - * @brief Dequantize B into the format expected by the Sgemm kernel. - * B is block quantized and column major. - * This is equivalent to dequantizing B and then running - * MlasSgemmCopyPackB. - * - * @tparam BlkBitWidth Bit width of each value in a block. - * @tparam BlkLen Number of values in a block. - * @tparam KernelType Hardware-specific kernel type. - * - * @param[out] FpData Supplies the output buffer for the dequantized B float data. - * @param QuantBData Supplies the quantized B matrix block data. - * @param QuantBScale Supplies the quantized B matrix block scale values. - * @param QuantBZeroPoint Supplies the quantized B matrix block zero point values. Optional. - * @param CountN Number of columns of B. - * @param CountK Number of rows of B. - * @param BlockStrideQuantB Number of blocks between adjacent columns of the quantized B matrix. - */ -template -MLAS_FORCEINLINE void -MlasQNBitBlkDequantBForSgemm( - float* FpData, - const uint8_t* QuantBData, - const float* QuantBScale, - const uint8_t* QuantBZeroPoint, - size_t CountN, - size_t CountK, - size_t BlockStrideQuantB -); - -// -// MlasQNBitGemmOperation and helpers -// - constexpr MLAS_FORCEINLINE size_t MlasQNBitBlkDataSizeInBytes(size_t BlkBitWidth, size_t BlkLen) { @@ -119,169 +44,201 @@ MlasQNBitZeroPointsForBlksSizeInBytes(size_t BlkCount) } } -MLAS_FORCEINLINE void -MlasAddBiasForGemm(const float* Bias, float* C, size_t CountM, size_t CountN, size_t ldc) +// +// Quantized int8 block helpers. 
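Before the accessors below, it helps to fix the block format they all address: one float scale followed by BlkLen int8 values, with blocks laid end to end. A small compile-time sketch, using BlkLen = 32 purely as an example:

#include <cstddef>
#include <cstdint>

// One CompInt8 block of A with BlkLen = 32:
//
//   byte offset:  0        4                           36
//                 | scale  | 32 int8 quantized values  |
//
// Q8BlkSize(32) == sizeof(float) + 32 == 36, so block i of a row begins at
// QuantA + i * 36; the size is a multiple of alignof(float), which is what
// allows consecutive blocks to be stored contiguously.
constexpr size_t ExampleBlkLen = 32;
constexpr size_t ExampleBlkSize = sizeof(float) + ExampleBlkLen * sizeof(int8_t);
static_assert(ExampleBlkSize % alignof(float) == 0, "blocks remain float-aligned");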
+// + +MLAS_FORCEINLINE +const float& +Q8BlkScale(const std::byte* BlkPtr) { - for (size_t m = 0; m < CountM; m++) { - const float* bias = Bias; - float* sum = C; - for (size_t n = 0; n < CountN; n += 4) { - if (CountN - n < 4) { - for (size_t nn = n; nn < CountN; nn++) { - *sum += *bias; - sum++; - bias++; - } - break; - } - - MLAS_FLOAT32X4 acc_x = MlasLoadFloat32x4(sum); - acc_x = MlasAddFloat32x4(acc_x, MlasLoadFloat32x4(bias)); - MlasStoreFloat32x4(sum, acc_x); - bias += 4; - sum += 4; - } - C += ldc; - } + return *reinterpret_cast(BlkPtr); } -template -MLAS_FORCEINLINE void MLASCALL -MlasSQNBitGemmOperation( - const size_t K, - const MLAS_SQNBIT_GEMM_DATA_PARAMS* const DataParams, - const size_t RangeStartM, - const size_t RangeCountM, - const size_t RangeStartN, - const size_t RangeCountN -) +MLAS_FORCEINLINE +float& +Q8BlkScale(std::byte* BlkPtr) { - const size_t lda = DataParams->lda; - const size_t ldc = DataParams->ldc; - - const size_t k_blks = MlasDivRoundup(K, BlkLen); - const size_t ldb = k_blks * MlasQNBitBlkDataSizeInBytes(BlkBitWidth, BlkLen); - const size_t k_blks_zp_bytes = MlasQNBitZeroPointsForBlksSizeInBytes(k_blks); - - const float* A = DataParams->A + RangeStartM * lda; - - const uint8_t* QuantBData = static_cast(DataParams->QuantBData) + RangeStartN * ldb; - const float* QuantBScale = DataParams->QuantBScale + RangeStartN * k_blks; - const uint8_t* QuantBZeroPoint = - (DataParams->QuantBZeroPoint == nullptr) - ? nullptr - : static_cast(DataParams->QuantBZeroPoint) + RangeStartN * k_blks_zp_bytes; - - float* C = DataParams->C + RangeStartM * ldc + RangeStartN; - - const float* Bias = (DataParams->Bias == nullptr) ? nullptr : DataParams->Bias + RangeStartN; - - if (RangeCountM == 1) { - size_t CountN; - for (size_t n = 0; n < RangeCountN; n += CountN) { - CountN = std::min(RangeCountN - n, size_t{128}); - - const float* a_row = A; - const uint8_t* b_col = QuantBData + n * ldb; - const float* b_col_scale = QuantBScale + n * k_blks; - const uint8_t* b_col_zp = - (QuantBZeroPoint == nullptr) ? nullptr : QuantBZeroPoint + n * k_blks_zp_bytes; - float* c_blk = C + n; - const float* bias = (Bias == nullptr) ? nullptr : Bias + n; - - MlasSQNBitGemmM1Kernel( - a_row, b_col, b_col_scale, b_col_zp, c_blk, CountN, K, k_blks, bias - ); - - if (DataParams->PostProcessor != nullptr) { - DataParams->PostProcessor->Process( - DataParams->C, RangeStartM, RangeStartN + n, - RangeCountM, CountN, ldc - ); - } - } - return; - } + return *reinterpret_cast(BlkPtr); +} - constexpr size_t StrideN = 32; - size_t bufsize = k_blks * BlkLen * StrideN * sizeof(float); - MlasThreadedBufAlloc(bufsize); - auto* dequant_b = reinterpret_cast(ThreadedBufHolder.get()); - // - // Step through each slice of matrix B along the N dimension. - // +MLAS_FORCEINLINE +const int8_t* +Q8BlkData(const std::byte* BlkPtr) +{ + return reinterpret_cast(BlkPtr + sizeof(float)); +} - size_t CountN; - for (size_t n = 0; n < RangeCountN; n += CountN) { - CountN = std::min(RangeCountN - n, StrideN); - - // - // Step through each slice of matrix A along the M dimension. - // - const float* a_row = A; - const uint8_t* b_col = QuantBData + n * ldb; - const float* b_col_scale = QuantBScale + n * k_blks; - const uint8_t* b_col_zp = - (QuantBZeroPoint == nullptr) ? nullptr : QuantBZeroPoint + n * k_blks_zp_bytes; - float* c_blk = C + n; - const float* bias = (Bias == nullptr) ? 
nullptr : Bias + n; - - MlasQNBitBlkDequantBForSgemm( - dequant_b, b_col, b_col_scale, b_col_zp, CountN, K, k_blks - ); - - size_t RowsRemaining = RangeCountM; - while (RowsRemaining > 0) { -#if defined(MLAS_TARGET_AMD64_IX86) || defined(MLAS_TARGET_POWER) - auto RowsHandled = GetMlasPlatform().GemmFloatKernel( - a_row, dequant_b, c_blk, K, RowsRemaining, CountN, lda, ldc, 1.f, true - ); -#else - auto RowsHandled = MlasSgemmKernelZero(a_row, dequant_b, c_blk, K, RowsRemaining, CountN, lda, ldc, 1.f); -#endif - - if (bias) { - MlasAddBiasForGemm(bias, c_blk, RowsHandled, CountN, ldc); - } - if (DataParams->PostProcessor != nullptr) { - DataParams->PostProcessor->Process( - DataParams->C, RangeStartM + RangeCountM - RowsRemaining, RangeStartN, - RowsHandled, CountN, ldc - ); - } - - c_blk += ldc * RowsHandled; - a_row += lda * RowsHandled; - RowsRemaining -= RowsHandled; - } - } +MLAS_FORCEINLINE +int8_t* +Q8BlkData(std::byte* BlkPtr) +{ + return reinterpret_cast(BlkPtr + sizeof(float)); +} + +MLAS_FORCEINLINE +constexpr size_t +Q8BlkSize(size_t BlkLen) +{ + const size_t BlkSize = sizeof(float) + BlkLen * sizeof(int8_t); + // Currently, the strictest alignment requirement of a block is for a float. + // Ensure contiguous blocks are suitably aligned. + assert(BlkSize % alignof(float) == 0); + return BlkSize; +} + +MLAS_FORCEINLINE +constexpr size_t +Q8BlkAlignment() +{ + return alignof(float); } // // Kernel dispatch structure. // -typedef void(MLASCALL MLAS_SQNBIT_GEMM_OPERATION)( - size_t K, - const MLAS_SQNBIT_GEMM_DATA_PARAMS* DataParams, - size_t RangeStartM, - size_t RangeCountM, - size_t RangeStartN, - size_t RangeCountN -); - -enum QuantVariant { - QuantVariant_BitWidth4_BlockSize16, - QuantVariant_BitWidth4_BlockSize32, - QuantVariant_BitWidth4_BlockSize64, - QuantVariant_BitWidth4_BlockSize128, - QuantVariant_BitWidth4_BlockSize256, - QuantVariantCount, // Keep this element last and ensure that its value is the number of other QuantVariant values. - // Its value is used as an array size. -}; - struct MLAS_SQNBIT_GEMM_DISPATCH { - MLAS_SQNBIT_GEMM_OPERATION* Operations[QuantVariantCount] = { - // Initialized to nullptrs. Overwrite in hardware-specific kernel implementation. - }; + // + // Quantized B data packing function prototypes. + // + + /** Gets size of packed quantized B data containing 4-bit integers. See MlasSQNBitGemmPackQuantBDataSize(). */ + typedef size_t(SQ4BitGemmPackQuantBDataSize_Fn)( + size_t N, + size_t K, + size_t BlkLen, + MLAS_SQNBIT_GEMM_COMPUTE_TYPE ComputeType + ); + + SQ4BitGemmPackQuantBDataSize_Fn* SQ4BitGemmPackQuantBDataSize = nullptr; + + /** Packs quantized B data containing 4-bit integers. See MlasSQNBitGemmPackQuantBData(). */ + typedef void(SQ4BitGemmPackQuantBData_Fn)( + size_t N, + size_t K, + size_t BlkLen, + MLAS_SQNBIT_GEMM_COMPUTE_TYPE ComputeType, + const std::byte* QuantBDataBegin, + std::byte* PackedQuantBDataBegin, + MLAS_THREADPOOL* ThreadPool + ); + + SQ4BitGemmPackQuantBData_Fn* SQ4BitGemmPackQuantBData = nullptr; + + // + // CompFp32 kernel function prototypes. + // + + /** + * @brief Multiply float matrix A with quantized 4-bit integer matrix B. + * B is block quantized and column major. + * This kernel handles the special case where M, the number of rows of A and C, is 1. + * + * @param BlkLen Number of values in a block. + * @param A Supplies the A matrix. + * @param QuantBData Supplies the quantized B matrix block data. + * @param QuantBScale Supplies the quantized B matrix block scale values. 
+ * @param QuantBZeroPoint Supplies the quantized B matrix block zero point values. Optional. + * @param[out] C Supplies the output C matrix. + * @param CountN Number of columns of B and C. + * @param CountK Number of columns of A and rows of B. + * @param BlockStrideQuantB Number of blocks between adjacent columns of the quantized B matrix. + * @param Bias Bias vector of length N. + */ + typedef void(SQ4BitGemmM1Kernel_CompFp32_Fn)( + size_t BlkLen, + const float* A, + const std::byte* QuantBData, + const float* QuantBScale, + const std::byte* QuantBZeroPoint, + float* C, + size_t CountN, + size_t CountK, + size_t BlockStrideQuantB, + const float* Bias + ); + + SQ4BitGemmM1Kernel_CompFp32_Fn* SQ4BitGemmM1Kernel_CompFp32 = nullptr; + + /** + * @brief Dequantize B into the format expected by the Sgemm kernel. + * B is a quantized 4-bit integer matrix that is block quantized and column major. + * This is equivalent to dequantizing B and then running MlasSgemmCopyPackB. + * + * @param BlkLen Number of values in a block. + * @param[out] FpData Supplies the output buffer for the dequantized B float data. + * @param QuantBData Supplies the quantized B matrix block data. + * @param QuantBScale Supplies the quantized B matrix block scale values. + * @param QuantBZeroPoint Supplies the quantized B matrix block zero point values. Optional. + * @param CountN Number of columns of B. + * @param CountK Number of rows of B. + * @param BlockStrideQuantB Number of blocks between adjacent columns of the quantized B matrix. + */ + typedef void(Q4BitBlkDequantBForSgemm_CompFp32_Fn)( + size_t BlkLen, + float* FpData, + const std::byte* QuantBData, + const float* QuantBScale, + const std::byte* QuantBZeroPoint, + size_t CountN, + size_t CountK, + size_t BlockStrideQuantB + ); + + Q4BitBlkDequantBForSgemm_CompFp32_Fn* Q4BitBlkDequantBForSgemm_CompFp32 = nullptr; + + // + // CompInt8 kernel function prototypes. + // + + /** + * @brief Multiply quantized 8-bit integer matrix A with quantized 4-bit integer matrix B. + * A and B are block quantized and B is column major. + * This kernel handles the special case where M, the number of rows of A and C, is 1. + * + * @param BlkLen Number of values in a block. + * @param QuantA Supplies the quantized A matrix. + Binary data containing block quantized int8 data and scale values. + * @param QuantBData Supplies the quantized B matrix block data. + * @param QuantBScale Supplies the quantized B matrix block scale values. + * @param QuantBZeroPoint Supplies the quantized B matrix block zero point values. Optional. + * @param[out] C Supplies the output C matrix. + * @param CountN Number of columns of B and C. + * @param CountK Number of columns of A and rows of B. + * @param BlockStrideQuantB Number of blocks between adjacent columns of the quantized B matrix. + * @param Bias Bias vector of length N. + */ + typedef void(SQ4BitGemmM1Kernel_CompInt8_Fn)( + size_t BlkLen, + const std::byte* QuantA, + const std::byte* QuantBData, + const float* QuantBScale, + const std::byte* QuantBZeroPoint, + float* C, + size_t CountN, + size_t CountK, + size_t BlockStrideQuantB, + const float* Bias + ); + + SQ4BitGemmM1Kernel_CompInt8_Fn* SQ4BitGemmM1Kernel_CompInt8 = nullptr; + + /** + * @brief Block quantize values from one row of matrix A from floats to quantized 8-bit integers. + * + * @param BlkLen Number of values in a block. + * @param A Supplies the A matrix. + * @param CountK Number of columns of A. + * @param[out] QuantA Supplies the output quantized A matrix. 
+ * Binary data containing block quantized int8 data and scale values. + */ + typedef void(QuantizeARow_CompInt8_Fn)( + size_t BlkLen, + const float* A, + size_t CountK, + std::byte* QuantA + ); + + QuantizeARow_CompInt8_Fn* QuantizeARow_CompInt8 = nullptr; }; diff --git a/onnxruntime/core/mlas/lib/sqnbitgemm_kernel_neon.cpp b/onnxruntime/core/mlas/lib/sqnbitgemm_kernel_neon.cpp index 63afe57dd913..9d7b0ae06e22 100644 --- a/onnxruntime/core/mlas/lib/sqnbitgemm_kernel_neon.cpp +++ b/onnxruntime/core/mlas/lib/sqnbitgemm_kernel_neon.cpp @@ -15,19 +15,114 @@ Module Name: --*/ -#include "sqnbitgemm.h" - #include #include #include #include +#include "sqnbitgemm.h" + // -// Hardware-specific kernel type. +// Quantized B data packing function implementation. +// + +namespace +{ + +size_t +SQ4BitGemmPackQuantBDataSize( + size_t N, + size_t K, + size_t BlkLen, + MLAS_SQNBIT_GEMM_COMPUTE_TYPE ComputeType +) +{ + MLAS_UNREFERENCED_PARAMETER(ComputeType); // same size regardless of ComputeType + + constexpr size_t BlkBitWidth = 4; + + const size_t BlockCountK = MlasDivRoundup(K, BlkLen); + const size_t PackedQuantBDataSize = N * BlockCountK * MlasQNBitBlkDataSizeInBytes(BlkBitWidth, BlkLen); + return PackedQuantBDataSize; +} + +void +SQ4BitGemmPackQuantBData( + size_t N, + size_t K, + size_t BlkLen, + MLAS_SQNBIT_GEMM_COMPUTE_TYPE ComputeType, + const std::byte* QuantBDataBegin, + std::byte* PackedQuantBDataBegin, + MLAS_THREADPOOL* ThreadPool +) +{ + constexpr size_t BlkBitWidth = 4; + + assert(BlkLen >= 16 && BlkLen % 16 == 0); + + const size_t BlockCountK = MlasDivRoundup(K, BlkLen); + const size_t BlkDataSize = MlasQNBitBlkDataSizeInBytes(BlkBitWidth, BlkLen); + const size_t Iterations = N * BlockCountK; // one iteration per block + + const size_t SubBlkLen = (ComputeType == CompInt8) + ? ((BlkLen == 16) ? 16 : 32) + : 16; + + const size_t SubBlkDataSize = SubBlkLen / 2; + const size_t SubBlkBytePairCount = SubBlkLen / 4; + + // + // For SubBlkLen == 16, pack 16 4-bit values (8 bytes) at a time like this: + // + // src: | v0 v1 | v2 v3 | v4 v5 | v6 v7 | v8 v9 | vA vB | vC vD | vE vF | + // => + // dst: | v0 v8 | v1 v9 | v2 vA | v3 vB | v4 vC | v5 vD | v6 vE | v7 vF | + // + + // + // For SubBlkLen == 32, pack 32 4-bit values (16 bytes) at a time like this: + // + // src: | v0 v1 | v2 v3 | ... | v28 v29 | v30 v31 | + // => + // dst: | v0 v16 | v1 v17 | ... | v14 v30 | v15 v31 | + // + + MlasTrySimpleParallel( + ThreadPool, Iterations, + [&](ptrdiff_t tid) { + const size_t n = tid / BlockCountK; + const size_t k_blk = tid % BlockCountK; + + const size_t data_offset = n * BlockCountK * BlkDataSize + k_blk * BlkDataSize; + const std::byte* QuantBData = QuantBDataBegin + data_offset; + std::byte* PackedQuantBData = PackedQuantBDataBegin + data_offset; + + for (size_t kk = 0; kk < BlkLen; kk += SubBlkLen) { + for (size_t byte_pair_idx = 0; byte_pair_idx < SubBlkBytePairCount; ++byte_pair_idx) { + const std::byte src0 = QuantBData[byte_pair_idx]; + const std::byte src1 = QuantBData[byte_pair_idx + SubBlkDataSize / 2]; + + std::byte& dst0 = PackedQuantBData[2 * byte_pair_idx]; + std::byte& dst1 = PackedQuantBData[2 * byte_pair_idx + 1]; + + dst0 = (src0 & std::byte{0x0F}) | ((src1 & std::byte{0x0F}) << 4); + dst1 = (src0 >> 4) | ((src1 >> 4) << 4); + } + + QuantBData += SubBlkDataSize; + PackedQuantBData += SubBlkDataSize; + } + } + ); +} + +} // namespace + +// +// General helpers. 
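The byte-pair shuffle in SQ4BitGemmPackQuantBData above is easiest to verify in plain scalar code. The sketch below (illustrative only) replays the SubBlkLen == 16 case on its 8 source bytes; pairing each value with its partner 8 positions later is what lets the kernels unpack a vector with one AND and one shift, as the updated dot-product kernel later in this file does in place of the old zip sequence.

#include <cstddef>
#include <cstdint>

// src holds nibbles | v0 v1 | v2 v3 | ... | vE vF | (low nibble first);
// dst receives      | v0 v8 | v1 v9 | ... | v7 vF |.
static void RepackSubBlk16(const uint8_t src[8], uint8_t dst[8])
{
    for (size_t i = 0; i < 4; ++i) {
        const uint8_t lo = src[i];      // v(2i) in low nibble, v(2i+1) in high
        const uint8_t hi = src[i + 4];  // v(2i+8) and v(2i+9)
        dst[2 * i + 0] = static_cast<uint8_t>((lo & 0x0F) | ((hi & 0x0F) << 4));
        dst[2 * i + 1] = static_cast<uint8_t>((lo >> 4) | ((hi >> 4) << 4));
    }
}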
// -struct MLAS_SQNBIT_GEMM_KERNEL_NEON { -}; namespace { @@ -70,7 +165,7 @@ FoldAccumulators(float32x4_t a0, float32x4_t a1, float32x4_t a2, float32x4_t a3) template MLAS_FORCEINLINE void -LoadData(const float* src, size_t count, float32x4_t (& dst)[Capacity / 4]) +LoadFloatData(const float* src, size_t count, float32x4_t (&dst)[Capacity / 4]) { static_assert(Capacity % 4 == 0, "Capacity must be divisible by 4."); @@ -101,13 +196,23 @@ LoadData(const float* src, size_t count, float32x4_t (& dst)[Capacity / 4]) } } -template +} // namespace + +// +// CompFp32 kernel implementation. +// + +namespace +{ + +template MLAS_FORCEINLINE void -ComputeDotProducts( +ComputeDotProducts_BlkBitWidth4_CompFp32( + size_t BlkLen, const float* ARowPtr, - const uint8_t* QuantBDataColPtr, + const std::byte* QuantBDataColPtr, const float* QuantBScaleColPtr, - const uint8_t* QuantBZeroPointColPtr, + const std::byte* QuantBZeroPointColPtr, float* SumPtr, size_t CountK, size_t StrideQuantBData, @@ -116,8 +221,13 @@ ComputeDotProducts( const float* BiasPtr ) { + constexpr size_t BlkBitWidth = 4; + constexpr size_t SubBlkLen = 16; + static_assert(NCols == 1 || NCols == 4, "NCols must be 1 or 4"); + assert(BlkLen >= SubBlkLen && BlkLen % SubBlkLen == 0); + const uint8x8_t LowMask = vdup_n_u8(0x0F); // Manual conversion to float takes place in two steps: @@ -135,9 +245,10 @@ ComputeDotProducts( float32x4_t acc[NCols]{}; - const uint8_t* QuantBData = QuantBDataColPtr; + const std::byte* QuantBData = QuantBDataColPtr; const float* QuantBScale = QuantBScaleColPtr; - size_t QuantBZeroPointIdx = 0; // track half byte increments with this index instead of a pointer + [[maybe_unused]] size_t QuantBZeroPointIdx = 0; // track half byte increments with this index instead of a pointer + // only used if HasZeroPoint == true for (size_t k = 0; k < CountK; k += BlkLen) { const size_t k_blk_len = std::min(CountK - k, BlkLen); @@ -147,52 +258,42 @@ ComputeDotProducts( [&](size_t i) { scale[i] = QuantBScale[i * StrideQuantBScale]; } ); - float offset[NCols]; // Includes zero point and float conversion offset of 16. - if (QuantBZeroPointColPtr != nullptr) { + [[maybe_unused]] float offset[NCols]; // Includes zero point and float conversion offset of 16. + // only used if HasZeroPoint == true + if constexpr (HasZeroPoint) { UnrolledLoop([&](size_t i) { - const uint8_t zp_packed = + const std::byte zp_packed = QuantBZeroPointColPtr[i * StrideQuantBZeroPoint + QuantBZeroPointIdx / 2]; - const uint8_t zp = ((QuantBZeroPointIdx & 1) == 1) ? (zp_packed >> 4) : (zp_packed & 0x0F); - offset[i] = 16.0f + zp; - }); - } else { - UnrolledLoop([&](size_t i) { - constexpr float zp = 8.0f; - offset[i] = 16.0f + zp; + const std::byte zp = ((QuantBZeroPointIdx & 1) == 1) + ? 
(zp_packed >> 4) + : (zp_packed & std::byte{0x0F}); + offset[i] = 16.0f + std::to_integer(zp); }); } - constexpr size_t SubBlkLen = 16; // number of block elements to process in one iteration - for (size_t k_idx_in_blk = 0; k_idx_in_blk < k_blk_len; k_idx_in_blk += SubBlkLen) { // load A row vector elements // load `SubBlkLen` elements from A, padded with 0's if there aren't enough const size_t k_subblk_len = std::min(k_blk_len - k_idx_in_blk, SubBlkLen); float32x4_t av[4]{}; - LoadData(ARowPtr + k + k_idx_in_blk, k_subblk_len, av); + LoadFloatData(ARowPtr + k + k_idx_in_blk, k_subblk_len, av); // load B column vectors uint8x8_t bv_packed[NCols]; + const size_t b_data_block_offset = k_idx_in_blk * BlkBitWidth / 8; UnrolledLoop([&](size_t i) { - const size_t b_data_block_offset = k_idx_in_blk * BlkBitWidth / 8; - bv_packed[i] = vld1_u8(QuantBData + i * StrideQuantBData + b_data_block_offset); - }); - - uint8x8_t bv_u8_unzipped[NCols][2]; - UnrolledLoop([&](size_t i) { - bv_u8_unzipped[i][0] = vand_u8(bv_packed[i], LowMask); - bv_u8_unzipped[i][1] = vand_u8(vshr_n_u8(bv_packed[i], 4), LowMask); + bv_packed[i] = vld1_u8( + reinterpret_cast(QuantBData) + i * StrideQuantBData + b_data_block_offset + ); }); uint8x8_t bv_u8[NCols][2]; UnrolledLoop([&](size_t i) { - bv_u8[i][0] = vzip1_u8(bv_u8_unzipped[i][0], bv_u8_unzipped[i][1]); - bv_u8[i][1] = vzip2_u8(bv_u8_unzipped[i][0], bv_u8_unzipped[i][1]); + bv_u8[i][0] = vand_u8(bv_packed[i], LowMask); + bv_u8[i][1] = vshr_n_u8(bv_packed[i], 4); }); - // dequantize B - // shift left 3 and widen to 16 bits uint16x8_t bv_u16[NCols][2]; UnrolledLoop([&](size_t i) { @@ -221,10 +322,17 @@ ComputeDotProducts( }); // subtract float conversion offset (16) and zero point - UnrolledLoop([&](size_t i) { - const float32x4_t offset_v = vdupq_n_f32(offset[i]); - UnrolledLoop<4>([&](size_t j) { bv[i][j] = vsubq_f32(bv[i][j], offset_v); }); - }); + if constexpr (HasZeroPoint) { + UnrolledLoop([&](size_t i) { + const float32x4_t offset_v = vdupq_n_f32(offset[i]); + UnrolledLoop<4>([&](size_t j) { bv[i][j] = vsubq_f32(bv[i][j], offset_v); }); + }); + } else { + const float32x4_t offset_v = vdupq_n_f32(16.0f + 8.0f); + UnrolledLoop([&](size_t i) { + UnrolledLoop<4>([&](size_t j) { bv[i][j] = vsubq_f32(bv[i][j], offset_v); }); + }); + } // multiply by scale UnrolledLoop([&](size_t i) { @@ -241,7 +349,9 @@ ComputeDotProducts( // increment pointers to next block QuantBData += MlasQNBitBlkDataSizeInBytes(BlkBitWidth, BlkLen); QuantBScale += 1; - QuantBZeroPointIdx += 1; + if constexpr (HasZeroPoint) { + QuantBZeroPointIdx += 1; + } } if constexpr (NCols == 4) { @@ -262,19 +372,14 @@ ComputeDotProducts( } } -} // namespace - -// -// MlasSQNBitGemmKernel and helpers. 
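Stripped of the SIMD bit manipulation, the per-element arithmetic of the CompFp32 kernel above is plain affine dequantization. An illustrative scalar equivalent, not part of the patch:

#include <cstdint>

// The 16.0f in `offset` matches the float-conversion trick in the kernel:
// each nibble v is first materialized as the float 16.0f + v, so subtracting
// (16.0f + zp) leaves (v - zp), which is then scaled.
static inline float DequantNibble4(uint8_t v, uint8_t zp, float scale)
{
    const float as_float = 16.0f + static_cast<float>(v);  // v in [0, 15]
    const float offset = 16.0f + static_cast<float>(zp);   // zp defaults to 8
    return (as_float - offset) * scale;  // e.g. v=5, zp=8, scale=0.25 -> -0.75
}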
-// - -template -MLAS_FORCEINLINE void -MlasSQNBitGemmM1KernelNeon( +template +void +SQ4BitGemmM1Kernel_CompFp32_Impl( + size_t BlkLen, const float* A, - const uint8_t* QuantBData, + const std::byte* QuantBData, const float* QuantBScale, - const uint8_t* QuantBZeroPoint, + const std::byte* QuantBZeroPoint, float* C, size_t CountN, size_t CountK, @@ -282,6 +387,7 @@ MlasSQNBitGemmM1KernelNeon( const float* Bias ) { + constexpr size_t BlkBitWidth = 4; constexpr size_t NCols = 4; const float* ARowPtr = A; @@ -295,16 +401,17 @@ MlasSQNBitGemmM1KernelNeon( const float* BiasPtr = Bias; - const uint8_t* QuantBDataColPtr = QuantBData; + const std::byte* QuantBDataColPtr = QuantBData; const float* QuantBScaleColPtr = QuantBScale; - const uint8_t* QuantBZeroPointColPtr = QuantBZeroPoint; + const std::byte* QuantBZeroPointColPtr = QuantBZeroPoint; float* SumPtr = CRowPtr; int64_t nblk = static_cast(CountN) - NCols; while (nblk >= 0) { - ComputeDotProducts( + ComputeDotProducts_BlkBitWidth4_CompFp32( + BlkLen, ARowPtr, QuantBDataColPtr, QuantBScaleColPtr, QuantBZeroPointColPtr, SumPtr, CountK, StrideQuantBData, StrideQuantBScale, StrideQuantBZeroPoint, BiasPtr @@ -314,7 +421,7 @@ MlasSQNBitGemmM1KernelNeon( QuantBDataColPtr += NCols * StrideQuantBData; QuantBScaleColPtr += NCols * StrideQuantBScale; - if (QuantBZeroPointColPtr != nullptr) { + if constexpr (HasZeroPoint) { QuantBZeroPointColPtr += NCols * StrideQuantBZeroPoint; } @@ -327,7 +434,8 @@ MlasSQNBitGemmM1KernelNeon( // left over columns less than `NCols`? nblk += NCols; for (int64_t n = 0; n < nblk; ++n) { - ComputeDotProducts( + ComputeDotProducts_BlkBitWidth4_CompFp32<1, HasZeroPoint>( + BlkLen, ARowPtr, QuantBDataColPtr, QuantBScaleColPtr, QuantBZeroPointColPtr, SumPtr, CountK, StrideQuantBData, StrideQuantBScale, StrideQuantBZeroPoint, BiasPtr @@ -337,7 +445,7 @@ MlasSQNBitGemmM1KernelNeon( QuantBDataColPtr += StrideQuantBData; QuantBScaleColPtr += StrideQuantBScale; - if (QuantBZeroPointColPtr != nullptr) { + if constexpr (HasZeroPoint) { QuantBZeroPointColPtr += StrideQuantBZeroPoint; } @@ -346,59 +454,70 @@ MlasSQNBitGemmM1KernelNeon( } } -#define SPECIALIZE_SQNBIT_GEMM_M1_KERNEL(BlkBitWidth, BlkLen) \ - template <> \ - MLAS_FORCEINLINE void \ - MlasSQNBitGemmM1Kernel( \ - const float* A, \ - const uint8_t* QuantBData, \ - const float* QuantBScale, \ - const uint8_t* QuantBZeroPoint, \ - float* C, \ - size_t CountN, \ - size_t CountK, \ - size_t BlockStrideQuantB, \ - const float* Bias \ - ) \ - { \ - return MlasSQNBitGemmM1KernelNeon( \ - A, QuantBData, QuantBScale, QuantBZeroPoint, C, CountN, CountK, \ - BlockStrideQuantB, Bias \ - ); \ +MLAS_FORCEINLINE void +SQ4BitGemmM1Kernel_CompFp32( + size_t BlkLen, + const float* A, + const std::byte* QuantBData, + const float* QuantBScale, + const std::byte* QuantBZeroPoint, + float* C, + size_t CountN, + size_t CountK, + size_t BlockStrideQuantB, + const float* Bias +) +{ + if (QuantBZeroPoint != nullptr) { + SQ4BitGemmM1Kernel_CompFp32_Impl( + BlkLen, + A, + QuantBData, + QuantBScale, + QuantBZeroPoint, + C, + CountN, + CountK, + BlockStrideQuantB, + Bias + ); + } else { + SQ4BitGemmM1Kernel_CompFp32_Impl( + BlkLen, + A, + QuantBData, + QuantBScale, + QuantBZeroPoint, + C, + CountN, + CountK, + BlockStrideQuantB, + Bias + ); } +} -SPECIALIZE_SQNBIT_GEMM_M1_KERNEL(4, 16) -SPECIALIZE_SQNBIT_GEMM_M1_KERNEL(4, 32) -SPECIALIZE_SQNBIT_GEMM_M1_KERNEL(4, 64) -SPECIALIZE_SQNBIT_GEMM_M1_KERNEL(4, 128) -SPECIALIZE_SQNBIT_GEMM_M1_KERNEL(4, 256) - -#undef SPECIALIZE_SQNBIT_GEMM_M1_KERNEL - -// -// 
MlasQNBitBlkDequantBForSgemm and helpers. -// - -template MLAS_FORCEINLINE void -MlasQNBitBlkDequantBForSgemmNeon( +Q4BitBlkDequantBForSgemm_CompFp32( + size_t BlkLen, float* FpData, - const uint8_t* QuantBData, + const std::byte* QuantBData, const float* QuantBScale, - const uint8_t* QuantBZeroPoint, + const std::byte* QuantBZeroPoint, size_t CountN, size_t CountK, size_t BlockStrideQuantB ) { auto impl0_reference = [&]() { - static_assert(BlkBitWidth == 4); + constexpr size_t BlkBitWidth = 4; + constexpr size_t SubBlkLen = 16; float* Dst = FpData; - const uint8_t* QuantBDataCol = QuantBData; + const std::byte* QuantBDataCol = QuantBData; const float* QuantBScaleCol = QuantBScale; - const uint8_t* QuantBZeroPointCol = QuantBZeroPoint; + const std::byte* QuantBZeroPointCol = QuantBZeroPoint; for (size_t n = 0; n < CountN; n += 16) { const size_t nnlen = std::min(CountN - n, size_t{16}); @@ -407,20 +526,26 @@ MlasQNBitBlkDequantBForSgemmNeon( for (size_t k = 0, k_blk_idx = 0; k < CountK; k += BlkLen, k_blk_idx += 1) { const size_t kklen = std::min(CountK - k, BlkLen); - const uint8_t* b_data = + const std::byte* b_data = QuantBDataCol + k_blk_idx * MlasQNBitBlkDataSizeInBytes(BlkBitWidth, BlkLen); const float b_s = QuantBScaleCol[k_blk_idx]; const uint8_t b_z = (QuantBZeroPointCol != nullptr) ? ((k_blk_idx & 1) == 1) - ? QuantBZeroPointCol[k_blk_idx / 2] >> 4 - : QuantBZeroPointCol[k_blk_idx / 2] & 0x0F + ? std::to_integer(QuantBZeroPointCol[k_blk_idx / 2] >> 4) + : std::to_integer(QuantBZeroPointCol[k_blk_idx / 2] & std::byte{0x0F}) : 8; for (size_t kk = 0; kk < kklen; ++kk) { - const uint8_t b_packed = b_data[kk / 2]; - const uint8_t b_byte = ((kk & 1) == 1) ? b_packed >> 4 : b_packed & 0x0F; - const float b_value = (b_byte - b_z) * b_s; + const size_t packed_idx = kk % SubBlkLen; + + const bool is_low_half = packed_idx < (SubBlkLen / 2); + const size_t packed_byte_idx = packed_idx % (SubBlkLen / 2); + const size_t packed_range_offset = (kk / SubBlkLen) * (SubBlkLen / 2); + + const std::byte b_packed = b_data[packed_range_offset + packed_byte_idx]; + const std::byte b_byte = is_low_half ? (b_packed & std::byte{0x0F}) : (b_packed >> 4); + const float b_value = (std::to_integer(b_byte) - b_z) * b_s; Dst[(k + kk) * 16 + nn] = b_value; } @@ -448,31 +573,651 @@ MlasQNBitBlkDequantBForSgemmNeon( impl0_reference(); } -#define SPECIALIZE_QNBIT_BLK_DEQUANT_B_FOR_SGEMM(BlkBitWidth, BlkLen) \ - template <> \ - MLAS_FORCEINLINE void \ - MlasQNBitBlkDequantBForSgemm( \ - float* FpData, \ - const uint8_t* QuantBData, \ - const float* QuantBScale, \ - const uint8_t* QuantBZeroPoint, \ - size_t CountN, \ - size_t CountK, \ - size_t BlockStrideQuantB \ - ) \ - { \ - MlasQNBitBlkDequantBForSgemmNeon( \ - FpData, QuantBData, QuantBScale, QuantBZeroPoint, CountN, CountK, BlockStrideQuantB \ - ); \ +// +// CompInt8 kernel implementation. +// + +template +MLAS_FORCEINLINE void +QuantizeBlock( + size_t BlkLen, + const float* A, + size_t ElementCount, + std::byte* QuantA +) +{ + static_assert(SubBlkLen >= 16 && SubBlkLen % 16 == 0); + + assert(BlkLen % SubBlkLen == 0); + + // + // Scan block values first to determine scale. 
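    // Numerically, the block is quantized symmetrically: with amax the
    // largest |a_i| in the block, scale = amax / 127 and each element is
    // mapped to q_i = round(a_i / scale) via vcvtaq (round to nearest, ties
    // away from zero), so every q_i fits in [-127, 127]. For example,
    // amax = 6.35 gives scale = 0.05, and a_i = -1.27 stores as -25.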
+ // + + float amax = 0.0f; // max of absolute values of A block + + size_t k; + for (k = 0; k < ElementCount; k += SubBlkLen) { + const size_t SubBlkElementCount = std::min(ElementCount - k, SubBlkLen); + + float32x4_t a[SubBlkLen / 4]{}; + LoadFloatData(A + k, SubBlkElementCount, a); + + float32x4_t abs_a[SubBlkLen / 4]; + UnrolledLoop([&](size_t i) { + abs_a[i] = vabsq_f32(a[i]); + }); + + // find amax of SubBlkLen elements + for (size_t interval = SubBlkLen / 4 / 2; interval > 0; interval /= 2) { + for (size_t i = 0; i < interval; ++i) { + abs_a[i] = vmaxq_f32(abs_a[i], abs_a[i + interval]); + } + } + + // update existing amax + amax = std::max(amax, vmaxvq_f32(abs_a[0])); + } + + constexpr float range_max = (1 << 7) - 1; + const float scale = amax / range_max; + const float scale_reciprocal = scale != 0.0f ? 1.0f / scale : 0.0f; + + Q8BlkScale(QuantA) = scale; + + // + // Compute quantized block values. + // + + int8_t* QuantAData = Q8BlkData(QuantA); + + for (k = 0; k < ElementCount; k += SubBlkLen) { + const size_t SubBlkElementCount = std::min(ElementCount - k, SubBlkLen); + + float32x4_t a[SubBlkLen / 4]{}; + LoadFloatData(A + k, SubBlkElementCount, a); + + UnrolledLoop([&](size_t i) { + a[i] = vmulq_n_f32(a[i], scale_reciprocal); + }); + + int32x4_t a_s32[SubBlkLen / 4]; + UnrolledLoop([&](size_t i) { + a_s32[i] = vcvtaq_s32_f32(a[i]); + }); + + UnrolledLoop([&](size_t i) { + QuantAData[k + i * 4 + 0] = static_cast(vgetq_lane_s32(a_s32[i], 0)); + QuantAData[k + i * 4 + 1] = static_cast(vgetq_lane_s32(a_s32[i], 1)); + QuantAData[k + i * 4 + 2] = static_cast(vgetq_lane_s32(a_s32[i], 2)); + QuantAData[k + i * 4 + 3] = static_cast(vgetq_lane_s32(a_s32[i], 3)); + }); + } + + // + // Zero out any remaining sub-block elements. + // + + for (; k < BlkLen; k += SubBlkLen) { + const int8x16_t Zeros = vdupq_n_s8(0); + UnrolledLoop([&](size_t i) { + vst1q_s8(QuantAData + k + i * 16, Zeros); + }); + } +} + +void MLASCALL +QuantizeARow_CompInt8( + size_t BlkLen, + const float* A, + size_t CountK, + std::byte* QuantA +) +{ + const float* ADataBlkPtr = A; + std::byte* QuantABlkPtr = QuantA; + + for (size_t k = 0; k < CountK; k += BlkLen) { + const size_t k_blk_len = std::min(CountK - k, BlkLen); + + QuantizeBlock<16>(BlkLen, ADataBlkPtr, k_blk_len, QuantABlkPtr); + + ADataBlkPtr += BlkLen; + QuantABlkPtr += Q8BlkSize(BlkLen); + } +} + +template +void +SQ4BitGemmM1Kernel_CompInt8_Impl_BlkLen16( + const std::byte* QuantA, + const std::byte* QuantBData, + const float* QuantBScale, + const std::byte* QuantBZeroPoint, + float* C, + size_t CountN, + size_t BlockCountK, + const float* Bias +) +{ + constexpr size_t BlkBitWidth = 4; + constexpr size_t BlkLen = 16; + + float* CRowPtr = C; + + const size_t StrideQuantBData = BlockCountK * MlasQNBitBlkDataSizeInBytes(BlkBitWidth, BlkLen); + const size_t StrideQuantBScale = BlockCountK; + const size_t StrideQuantBZeroPoint = MlasQNBitZeroPointsForBlksSizeInBytes(BlockCountK); + + const float* BiasPtr = Bias; + + const std::byte* QuantBDataColPtr = QuantBData; + const float* QuantBScaleColPtr = QuantBScale; + const std::byte* QuantBZeroPointColPtr = QuantBZeroPoint; + + float* SumPtr = CRowPtr; + + const uint8x16_t LowMaskU8x16 = vdupq_n_u8(0x0F); + const uint8x8_t LowMaskU8x8 = vdup_n_u8(0x0F); + + for (size_t n = 0; n < CountN; ++n) { + const std::byte* QuantAPtr = QuantA; + const std::byte* QuantBDataPtr = QuantBDataColPtr; + const float* QuantBScalePtr = QuantBScaleColPtr; + const std::byte* QuantBZeroPointPtr = QuantBZeroPointColPtr; + + float32x4_t 
+
+template <bool HasZeroPoint>
+void
+SQ4BitGemmM1Kernel_CompInt8_Impl_BlkLen16(
+    const std::byte* QuantA,
+    const std::byte* QuantBData,
+    const float* QuantBScale,
+    const std::byte* QuantBZeroPoint,
+    float* C,
+    size_t CountN,
+    size_t BlockCountK,
+    const float* Bias
+)
+{
+    constexpr size_t BlkBitWidth = 4;
+    constexpr size_t BlkLen = 16;
+
+    float* CRowPtr = C;
+
+    const size_t StrideQuantBData = BlockCountK * MlasQNBitBlkDataSizeInBytes(BlkBitWidth, BlkLen);
+    const size_t StrideQuantBScale = BlockCountK;
+    const size_t StrideQuantBZeroPoint = MlasQNBitZeroPointsForBlksSizeInBytes<BlkBitWidth>(BlockCountK);
+
+    const float* BiasPtr = Bias;
+
+    const std::byte* QuantBDataColPtr = QuantBData;
+    const float* QuantBScaleColPtr = QuantBScale;
+    const std::byte* QuantBZeroPointColPtr = QuantBZeroPoint;
+
+    float* SumPtr = CRowPtr;
+
+    const uint8x16_t LowMaskU8x16 = vdupq_n_u8(0x0F);
+    const uint8x8_t LowMaskU8x8 = vdup_n_u8(0x0F);
+
+    for (size_t n = 0; n < CountN; ++n) {
+        const std::byte* QuantAPtr = QuantA;
+        const std::byte* QuantBDataPtr = QuantBDataColPtr;
+        const float* QuantBScalePtr = QuantBScaleColPtr;
+        const std::byte* QuantBZeroPointPtr = QuantBZeroPointColPtr;
+
+        float32x4_t acc0{}, acc1{};
+
+        size_t k_blks_remaining = BlockCountK;
+        for (; k_blks_remaining > 1; k_blks_remaining -= 2) {
+            const std::byte* QuantABlk0 = QuantAPtr;
+            const std::byte* QuantABlk1 = QuantABlk0 + Q8BlkSize(BlkLen);
+
+            // compute combined scale
+            const float32x4_t scale0 = vdupq_n_f32(Q8BlkScale(QuantABlk0) * QuantBScalePtr[0]);
+            const float32x4_t scale1 = vdupq_n_f32(Q8BlkScale(QuantABlk1) * QuantBScalePtr[1]);
+
+            // load B zero point
+            const int8x16_t bzp0 = vdupq_n_s8(
+                HasZeroPoint ? std::to_integer<int8_t>(QuantBZeroPointPtr[0] & std::byte{0x0F}) : 8
+            );
+            const int8x16_t bzp1 = vdupq_n_s8(
+                HasZeroPoint ? std::to_integer<int8_t>(QuantBZeroPointPtr[0] >> 4) : 8
+            );
+
+            // load A
+            const int8x16_t av0 = vld1q_s8(Q8BlkData(QuantABlk0));
+            const int8x16_t av1 = vld1q_s8(Q8BlkData(QuantABlk1));
+
+            // load B
+            const uint8x16_t bv_packed01 = vld1q_u8(reinterpret_cast<const uint8_t*>(QuantBDataPtr));
+
+            const uint8x16_t bv_lo01 = vandq_u8(bv_packed01, LowMaskU8x16);
+            const uint8x16_t bv_hi01 = vshrq_n_u8(bv_packed01, 4);
+
+            int8x16_t bv0 = vreinterpretq_s8_u8(vcombine_u8(vget_low_u8(bv_lo01), vget_low_u8(bv_hi01)));
+            int8x16_t bv1 = vreinterpretq_s8_u8(vcombine_u8(vget_high_u8(bv_lo01), vget_high_u8(bv_hi01)));
+
+            // subtract B zero point
+            bv0 = vsubq_s8(bv0, bzp0);
+            bv1 = vsubq_s8(bv1, bzp1);
+
+            // quantized dot product
+            const int32x4_t dot0 = vdotq_s32(vdupq_n_s32(0), av0, bv0);
+            const int32x4_t dot1 = vdotq_s32(vdupq_n_s32(0), av1, bv1);
+
+            // convert to float
+            const float32x4_t dot_f32_0 = vcvtq_f32_s32(dot0);
+            const float32x4_t dot_f32_1 = vcvtq_f32_s32(dot1);
+
+            // multiply by scale and update accumulator
+            acc0 = vfmaq_f32(acc0, dot_f32_0, scale0);
+            acc1 = vfmaq_f32(acc1, dot_f32_1, scale1);
+
+            // increment block pointers
+
+            QuantAPtr += Q8BlkSize(BlkLen) * 2;
+            QuantBDataPtr += 8 * 2;
+            QuantBScalePtr += 2;
+            if constexpr (HasZeroPoint) {
+                QuantBZeroPointPtr += 1;
+            }
+        }
+
+        if (k_blks_remaining > 0) {
+            const std::byte* QuantABlk0 = QuantAPtr;
+
+            // compute combined scale
+            const float32x4_t scale0 = vdupq_n_f32(Q8BlkScale(QuantABlk0) * (*QuantBScalePtr));
+
+            // load B zero point
+            const int8x16_t bzp0 = vdupq_n_s8(
+                HasZeroPoint ? std::to_integer<int8_t>(QuantBZeroPointPtr[0] & std::byte{0x0F}) : 8
+            );
+
+            // load A
+            const int8x16_t av0 = vld1q_s8(Q8BlkData(QuantABlk0));
+
+            // load B
+            const uint8x8_t bv_packed0 = vld1_u8(reinterpret_cast<const uint8_t*>(QuantBDataPtr));
+
+            const uint8x8_t bv_lo0 = vand_u8(bv_packed0, LowMaskU8x8);
+            const uint8x8_t bv_hi0 = vshr_n_u8(bv_packed0, 4);
+
+            int8x16_t bv0 = vreinterpretq_s8_u8(vcombine_u8(bv_lo0, bv_hi0));
+
+            // subtract B zero point
+            bv0 = vsubq_s8(bv0, bzp0);
+
+            // quantized dot product
+            const int32x4_t dot0 = vdotq_s32(vdupq_n_s32(0), av0, bv0);
+
+            // convert to float
+            const float32x4_t dot_f32_0 = vcvtq_f32_s32(dot0);
+
+            // multiply by scale and update accumulator
+            acc0 = vfmaq_f32(acc0, dot_f32_0, scale0);
+        }
+
+        *SumPtr = vaddvq_f32(acc0) + vaddvq_f32(acc1);
+        if (BiasPtr) {
+            *SumPtr += *BiasPtr;
+        }
+
+        // move to next column
+
+        QuantBDataColPtr += StrideQuantBData;
+        QuantBScaleColPtr += StrideQuantBScale;
+        if constexpr (HasZeroPoint) {
+            QuantBZeroPointColPtr += StrideQuantBZeroPoint;
+        }
+
+        BiasPtr += BiasPtr != nullptr ? 1 : 0;
+        SumPtr += 1;
+    }
+}
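All three CompInt8 kernels share one accumulation scheme: subtract the B zero point in int8, take an integer dot product into int32, and only then apply the combined scale Q8BlkScale(A) * QuantBScale once per block. A scalar equivalent (illustrative, not part of the patch):

    #include <cstddef>
    #include <cstdint>

    float BlockDotProduct(
        const int8_t* a_quant, float a_scale,
        const uint8_t* b_quant, float b_scale, uint8_t b_zero_point,
        size_t blk_len
    )
    {
        int32_t acc = 0;
        for (size_t i = 0; i < blk_len; ++i) {
            // subtract the B zero point first, as vsubq_s8 does above
            acc += static_cast<int32_t>(a_quant[i]) *
                   (static_cast<int32_t>(b_quant[i]) - b_zero_point);
        }
        // one multiply per block by the combined scale, as in vfmaq_f32(acc, dot, scale)
        return static_cast<float>(acc) * (a_scale * b_scale);
    }

Since the B operands are 4-bit (magnitude at most 15 after zero-point adjustment) and blocks hold at most a few hundred elements, the int32 accumulator stays far from overflow.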
+
+template <bool HasZeroPoint>
+void
+SQ4BitGemmM1Kernel_CompInt8_Impl_BlkLen32(
+    const std::byte* QuantA,
+    const std::byte* QuantBData,
+    const float* QuantBScale,
+    const std::byte* QuantBZeroPoint,
+    float* C,
+    size_t CountN,
+    size_t BlockCountK,
+    const float* Bias
+)
+{
+    constexpr size_t BlkBitWidth = 4;
+    constexpr size_t BlkLen = 32;
+
+    float* CRowPtr = C;
+
+    const size_t StrideQuantBData = BlockCountK * MlasQNBitBlkDataSizeInBytes(BlkBitWidth, BlkLen);
+    const size_t StrideQuantBScale = BlockCountK;
+    const size_t StrideQuantBZeroPoint = MlasQNBitZeroPointsForBlksSizeInBytes<BlkBitWidth>(BlockCountK);
+
+    const float* BiasPtr = Bias;
+
+    const std::byte* QuantBDataColPtr = QuantBData;
+    const float* QuantBScaleColPtr = QuantBScale;
+    const std::byte* QuantBZeroPointColPtr = QuantBZeroPoint;
+
+    float* SumPtr = CRowPtr;
+
+    const uint8x16_t LowMaskU8x16 = vdupq_n_u8(0x0F);
+
+    for (size_t n = 0; n < CountN; ++n) {
+        const std::byte* QuantAPtr = QuantA;
+        const std::byte* QuantBDataPtr = QuantBDataColPtr;
+        const float* QuantBScalePtr = QuantBScaleColPtr;
+        const std::byte* QuantBZeroPointPtr = QuantBZeroPointColPtr;
+
+        float32x4_t acc0{}, acc1{};
+
+        size_t k_blks_remaining = BlockCountK;
+        for (; k_blks_remaining > 1; k_blks_remaining -= 2) {
+            const std::byte* QuantABlk0 = QuantAPtr;
+            const std::byte* QuantABlk1 = QuantABlk0 + Q8BlkSize(BlkLen);
+
+            // compute combined scale
+            const float32x4_t scale0 = vdupq_n_f32(Q8BlkScale(QuantABlk0) * QuantBScalePtr[0]);
+            const float32x4_t scale1 = vdupq_n_f32(Q8BlkScale(QuantABlk1) * QuantBScalePtr[1]);
+
+            // load B zero point
+            const int8x16_t bzp0 = vdupq_n_s8(
+                HasZeroPoint ? std::to_integer<int8_t>((*QuantBZeroPointPtr) & std::byte{0x0F}) : 8
+            );
+            const int8x16_t bzp1 = vdupq_n_s8(
+                HasZeroPoint ? std::to_integer<int8_t>((*QuantBZeroPointPtr) >> 4) : 8
+            );
+
+            // load A
+            const int8x16_t av_lo0 = vld1q_s8(Q8BlkData(QuantABlk0));
+            const int8x16_t av_hi0 = vld1q_s8(Q8BlkData(QuantABlk0) + 16);
+            const int8x16_t av_lo1 = vld1q_s8(Q8BlkData(QuantABlk1));
+            const int8x16_t av_hi1 = vld1q_s8(Q8BlkData(QuantABlk1) + 16);
+
+            // load B
+            const uint8x16_t bv_packed0 = vld1q_u8(reinterpret_cast<const uint8_t*>(QuantBDataPtr));
+            const uint8x16_t bv_packed1 = vld1q_u8(reinterpret_cast<const uint8_t*>(QuantBDataPtr) + 16);
+
+            int8x16_t bv_lo0 = vreinterpretq_s8_u8(vandq_u8(bv_packed0, LowMaskU8x16));
+            int8x16_t bv_hi0 = vreinterpretq_s8_u8(vshrq_n_u8(bv_packed0, 4));
+            int8x16_t bv_lo1 = vreinterpretq_s8_u8(vandq_u8(bv_packed1, LowMaskU8x16));
+            int8x16_t bv_hi1 = vreinterpretq_s8_u8(vshrq_n_u8(bv_packed1, 4));
+
+            // subtract B zero point
+            bv_lo0 = vsubq_s8(bv_lo0, bzp0);
+            bv_hi0 = vsubq_s8(bv_hi0, bzp0);
+            bv_lo1 = vsubq_s8(bv_lo1, bzp1);
+            bv_hi1 = vsubq_s8(bv_hi1, bzp1);
+
+            // quantized dot product
+            int32x4_t dot0{}, dot1{};
+            dot0 = vdotq_s32(vdotq_s32(dot0, av_lo0, bv_lo0), av_hi0, bv_hi0);
+            dot1 = vdotq_s32(vdotq_s32(dot1, av_lo1, bv_lo1), av_hi1, bv_hi1);
+
+            // convert to float
+            const float32x4_t dot_f32_0 = vcvtq_f32_s32(dot0);
+            const float32x4_t dot_f32_1 = vcvtq_f32_s32(dot1);
+
+            // multiply by scale and update accumulator
+            acc0 = vfmaq_f32(acc0, dot_f32_0, scale0);
+            acc1 = vfmaq_f32(acc1, dot_f32_1, scale1);
+
+            // increment block pointers
+
+            QuantAPtr += Q8BlkSize(BlkLen) * 2;
+            QuantBDataPtr += 16 * 2;
+            QuantBScalePtr += 2;
+            if constexpr (HasZeroPoint) {
+                QuantBZeroPointPtr += 1;
+            }
+        }
+
+        if (k_blks_remaining > 0) {
+            const std::byte* QuantABlk0 = QuantAPtr;
+
+            // compute combined scale
+            const float32x4_t scale0 = vdupq_n_f32(Q8BlkScale(QuantABlk0) * (*QuantBScalePtr));
+
+            // load B zero point
+            const int8x16_t bzp0 = vdupq_n_s8(
+                HasZeroPoint ? std::to_integer<int8_t>((*QuantBZeroPointPtr) & std::byte{0x0F}) : 8
+            );
+
+            // load A
+            const int8x16_t av_lo0 = vld1q_s8(Q8BlkData(QuantABlk0));
+            const int8x16_t av_hi0 = vld1q_s8(Q8BlkData(QuantABlk0) + 16);
+
+            // load B
+            const uint8x16_t bv_packed0 = vld1q_u8(reinterpret_cast<const uint8_t*>(QuantBDataPtr));
+
+            int8x16_t bv_lo0 = vreinterpretq_s8_u8(vandq_u8(bv_packed0, LowMaskU8x16));
+            int8x16_t bv_hi0 = vreinterpretq_s8_u8(vshrq_n_u8(bv_packed0, 4));
+
+            // subtract B zero point
+            bv_lo0 = vsubq_s8(bv_lo0, bzp0);
+            bv_hi0 = vsubq_s8(bv_hi0, bzp0);
+
+            // quantized dot product
+            int32x4_t dot0{};
+            dot0 = vdotq_s32(vdotq_s32(dot0, av_lo0, bv_lo0), av_hi0, bv_hi0);
+
+            // convert to float
+            const float32x4_t dot_f32_0 = vcvtq_f32_s32(dot0);
+
+            // multiply by scale and update accumulator
+            acc0 = vfmaq_f32(acc0, dot_f32_0, scale0);
+        }
+
+        *SumPtr = vaddvq_f32(acc0) + vaddvq_f32(acc1);
+        if (BiasPtr) {
+            *SumPtr += *BiasPtr;
+        }
+
+        // move to next column
+
+        QuantBDataColPtr += StrideQuantBData;
+        QuantBScaleColPtr += StrideQuantBScale;
+        if constexpr (HasZeroPoint) {
+            QuantBZeroPointColPtr += StrideQuantBZeroPoint;
+        }
+
+        BiasPtr += BiasPtr != nullptr ? 1 : 0;
+        SumPtr += 1;
     }
+}
-SPECIALIZE_QNBIT_BLK_DEQUANT_B_FOR_SGEMM(4, 16)
-SPECIALIZE_QNBIT_BLK_DEQUANT_B_FOR_SGEMM(4, 32)
-SPECIALIZE_QNBIT_BLK_DEQUANT_B_FOR_SGEMM(4, 64)
-SPECIALIZE_QNBIT_BLK_DEQUANT_B_FOR_SGEMM(4, 128)
-SPECIALIZE_QNBIT_BLK_DEQUANT_B_FOR_SGEMM(4, 256)
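Both HasZeroPoint paths read zero points from a nibble-packed array: the zero points of blocks 2*i and 2*i + 1 share byte i, low nibble first, which is why QuantBZeroPointPtr advances only after every second block. Extraction sketch (illustrative, not part of the patch):

    #include <cstddef>
    #include <cstdint>

    uint8_t GetBlockZeroPoint(const std::byte* packed_zero_points, size_t block_idx)
    {
        const std::byte b = packed_zero_points[block_idx / 2];
        return (block_idx & 1) == 0
                   ? std::to_integer<uint8_t>(b & std::byte{0x0F})
                   : std::to_integer<uint8_t>(b >> 4);
    }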
+template <bool HasZeroPoint>
+void
+SQ4BitGemmM1Kernel_CompInt8_Impl_BlkLenGreaterThan32(
+    size_t BlkLen,
+    const std::byte* QuantA,
+    const std::byte* QuantBData,
+    const float* QuantBScale,
+    const std::byte* QuantBZeroPoint,
+    float* C,
+    size_t CountN,
+    size_t BlockCountK,
+    const float* Bias
+)
+{
+    constexpr size_t BlkBitWidth = 4;
+
+    assert(BlkLen > 32);
+    assert(BlkLen % 32 == 0);
+
+    float* CRowPtr = C;
+
+    const size_t StrideQuantBData = BlockCountK * MlasQNBitBlkDataSizeInBytes(BlkBitWidth, BlkLen);
+    const size_t StrideQuantBScale = BlockCountK;
+    const size_t StrideQuantBZeroPoint = MlasQNBitZeroPointsForBlksSizeInBytes<BlkBitWidth>(BlockCountK);
+
+    const float* BiasPtr = Bias;
+
+    const std::byte* QuantBDataColPtr = QuantBData;
+    const float* QuantBScaleColPtr = QuantBScale;
+    const std::byte* QuantBZeroPointColPtr = QuantBZeroPoint;
+
+    float* SumPtr = CRowPtr;
+
+    const uint8x16_t LowMaskU8x16 = vdupq_n_u8(0x0F);
+
+    // process blocks in 32-element sub-blocks
+    const size_t SubBlksPerBlk = BlkLen / 32;
+
+    for (size_t n = 0; n < CountN; ++n) {
+        const std::byte* QuantAPtr = QuantA;
+        const std::byte* QuantBDataPtr = QuantBDataColPtr;
+        const float* QuantBScalePtr = QuantBScaleColPtr;
+        const std::byte* QuantBZeroPointPtr = QuantBZeroPointColPtr;
+
+        float32x4_t acc0{}, acc1{};
+
+        for (size_t k_blk_idx = 0; k_blk_idx < BlockCountK; ++k_blk_idx) {
+            // compute combined scale
+            const float32x4_t scale = vdupq_n_f32(Q8BlkScale(QuantAPtr) * (*QuantBScalePtr));
+
+            // load B zero point
+            const int8x16_t bzp = [&]() -> int8x16_t {
+                if constexpr (HasZeroPoint) {
+                    return vdupq_n_s8(
+                        ((k_blk_idx & 1) == 0) ? std::to_integer<int8_t>((*QuantBZeroPointPtr) & std::byte{0x0F})
+                                               : std::to_integer<int8_t>((*QuantBZeroPointPtr) >> 4)
+                    );
+                } else {
+                    return vdupq_n_s8(8);
+                }
+            }();
+
+            const int8_t* QuantADataPtr = Q8BlkData(QuantAPtr);
+
+            for (size_t sub_blk_idx = 0; sub_blk_idx < SubBlksPerBlk; sub_blk_idx += 2) {
+                // load A
+                const int8x16_t av0 = vld1q_s8(QuantADataPtr + 0);
+                const int8x16_t av1 = vld1q_s8(QuantADataPtr + 16);
+                const int8x16_t av2 = vld1q_s8(QuantADataPtr + 32);
+                const int8x16_t av3 = vld1q_s8(QuantADataPtr + 48);
+
+                // load B
+                const uint8x16_t bv_packed0 = vld1q_u8(reinterpret_cast<const uint8_t*>(QuantBDataPtr));
+                const uint8x16_t bv_packed1 = vld1q_u8(reinterpret_cast<const uint8_t*>(QuantBDataPtr) + 16);
+
+                int8x16_t bv0 = vreinterpretq_s8_u8(vandq_u8(bv_packed0, LowMaskU8x16));
+                int8x16_t bv1 = vreinterpretq_s8_u8(vshrq_n_u8(bv_packed0, 4));
+                int8x16_t bv2 = vreinterpretq_s8_u8(vandq_u8(bv_packed1, LowMaskU8x16));
+                int8x16_t bv3 = vreinterpretq_s8_u8(vshrq_n_u8(bv_packed1, 4));
+
+                // subtract B zero point
+                bv0 = vsubq_s8(bv0, bzp);
+                bv1 = vsubq_s8(bv1, bzp);
+                bv2 = vsubq_s8(bv2, bzp);
+                bv3 = vsubq_s8(bv3, bzp);
+
+                // quantized dot product
+                int32x4_t dot0{}, dot1{};
+                dot0 = vdotq_s32(vdotq_s32(dot0, av0, bv0), av1, bv1);
+                dot1 = vdotq_s32(vdotq_s32(dot1, av2, bv2), av3, bv3);
+
+                // convert to float
+                const float32x4_t dot_f32_0 = vcvtq_f32_s32(dot0);
+                const float32x4_t dot_f32_1 = vcvtq_f32_s32(dot1);
+
+                // multiply by scale and update accumulator
+                acc0 = vfmaq_f32(acc0, dot_f32_0, scale);
+                acc1 = vfmaq_f32(acc1, dot_f32_1, scale);
+
+                // increment block data pointers to next sub-block
+                QuantADataPtr += 16 * 4;
+                QuantBDataPtr += 16 * 2;
+            }
+
+            // increment other block pointers
+
+            QuantAPtr += Q8BlkSize(BlkLen);
+            QuantBScalePtr += 1;
+
+            if constexpr (HasZeroPoint) {
+                QuantBZeroPointPtr += ((k_blk_idx & 1) == 0) ? 0 : 1;
+            }
+        }
+
+        *SumPtr = vaddvq_f32(acc0) + vaddvq_f32(acc1);
+        if (BiasPtr) {
+            *SumPtr += *BiasPtr;
+        }
+
+        // move to next column
+
+        QuantBDataColPtr += StrideQuantBData;
+        QuantBScaleColPtr += StrideQuantBScale;
+        if constexpr (HasZeroPoint) {
+            QuantBZeroPointColPtr += StrideQuantBZeroPoint;
+        }
-#undef SPECIALIZE_QNBIT_BLK_DEQUANT_B_FOR_SGEMM
+        BiasPtr += BiasPtr != nullptr ? 1 : 0;
+        SumPtr += 1;
+    }
+}
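The column strides rest on two size helpers. As a rough model of their semantics (an assumption for illustration; the authoritative definitions live in the MLAS headers): a 4-bit block packs BlkLen values into BlkLen/2 bytes, and a quantized-A (Q8) block is one float scale followed by BlkLen int8 values.

    #include <cstddef>
    #include <cstdint>

    // assumed helper semantics, mirroring MlasQNBitBlkDataSizeInBytes
    constexpr size_t BlkDataSizeInBytes(size_t BlkBitWidth, size_t BlkLen)
    {
        return BlkLen * BlkBitWidth / 8;  // e.g. 32 four-bit values -> 16 bytes
    }

    // assumed model of Q8BlkSize: scale followed by quantized data
    constexpr size_t Q8BlkSizeModel(size_t BlkLen)
    {
        return sizeof(float) + BlkLen * sizeof(int8_t);
    }

    static_assert(BlkDataSizeInBytes(4, 32) == 16);
    static_assert(Q8BlkSizeModel(32) == 36);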
+
+template <bool HasZeroPoint>
+MLAS_FORCEINLINE void
+SQ4BitGemmM1Kernel_CompInt8_DispatchOnBlkLen(
+    size_t BlkLen,
+    const std::byte* QuantA,
+    const std::byte* QuantBData,
+    const float* QuantBScale,
+    const std::byte* QuantBZeroPoint,
+    float* C,
+    size_t CountN,
+    size_t BlockStrideQuantB,
+    const float* Bias
+)
+{
+    if (BlkLen == 16) {
+        SQ4BitGemmM1Kernel_CompInt8_Impl_BlkLen16<HasZeroPoint>(
+            QuantA,
+            QuantBData,
+            QuantBScale,
+            QuantBZeroPoint,
+            C,
+            CountN,
+            BlockStrideQuantB,
+            Bias
+        );
+    } else if (BlkLen == 32) {
+        SQ4BitGemmM1Kernel_CompInt8_Impl_BlkLen32<HasZeroPoint>(
+            QuantA,
+            QuantBData,
+            QuantBScale,
+            QuantBZeroPoint,
+            C,
+            CountN,
+            BlockStrideQuantB,
+            Bias
+        );
+    } else {
+        SQ4BitGemmM1Kernel_CompInt8_Impl_BlkLenGreaterThan32<HasZeroPoint>(
+            BlkLen,
+            QuantA,
+            QuantBData,
+            QuantBScale,
+            QuantBZeroPoint,
+            C,
+            CountN,
+            BlockStrideQuantB,
+            Bias
+        );
+    }
+}
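The reason every inner loop above moves in multiples of 16 int8 values is the shape of vdotq_s32: each of the four int32 lanes of the accumulator gathers a dot product of four adjacent int8 pairs, so one instruction retires 16 multiply-accumulates. Scalar model (illustrative):

    #include <cstdint>

    // models vdotq_s32(acc, a, b) over 16-byte vectors
    void DotProductModel(const int8_t a[16], const int8_t b[16], int32_t acc[4])
    {
        for (int lane = 0; lane < 4; ++lane) {
            for (int j = 0; j < 4; ++j) {
                acc[lane] += static_cast<int32_t>(a[lane * 4 + j]) * b[lane * 4 + j];
            }
        }
    }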
+
+MLAS_FORCEINLINE
+void
+SQ4BitGemmM1Kernel_CompInt8(
+    size_t BlkLen,
+    const std::byte* QuantA,
+    const std::byte* QuantBData,
+    const float* QuantBScale,
+    const std::byte* QuantBZeroPoint,
+    float* C,
+    size_t CountN,
+    size_t /*CountK*/,
+    size_t BlockStrideQuantB,
+    const float* Bias
+)
+{
+    if (QuantBZeroPoint != nullptr) {
+        SQ4BitGemmM1Kernel_CompInt8_DispatchOnBlkLen<true>(
+            BlkLen,
+            QuantA,
+            QuantBData,
+            QuantBScale,
+            QuantBZeroPoint,
+            C,
+            CountN,
+            BlockStrideQuantB,
+            Bias
+        );
+    } else {
+        SQ4BitGemmM1Kernel_CompInt8_DispatchOnBlkLen<false>(
+            BlkLen,
+            QuantA,
+            QuantBData,
+            QuantBScale,
+            QuantBZeroPoint,
+            C,
+            CountN,
+            BlockStrideQuantB,
+            Bias
+        );
+    }
+}
+
+}  // namespace
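SQ4BitGemmM1Kernel_CompInt8 resolves the runtime QuantBZeroPoint != nullptr test into a template parameter once, so inside the kernels the zero-point handling is an `if constexpr` and each instantiation carries no branch in its hot loop. A minimal sketch of the pattern (names illustrative, not part of the patch):

    #include <cstddef>
    #include <cstdint>

    template <bool HasZeroPoint>
    int32_t AdjustAndSum(const uint8_t* b, size_t len, uint8_t zero_point)
    {
        int32_t sum = 0;
        for (size_t i = 0; i < len; ++i) {
            if constexpr (HasZeroPoint) {
                sum += static_cast<int32_t>(b[i]) - zero_point;  // stored zero point
            } else {
                sum += static_cast<int32_t>(b[i]) - 8;  // implicit zero point of 8
            }
        }
        return sum;
    }

    int32_t Dispatch(const uint8_t* b, size_t len, const uint8_t* zero_point)
    {
        // branch once here; each instantiation is branch-free inside
        return zero_point != nullptr ? AdjustAndSum<true>(b, len, *zero_point)
                                     : AdjustAndSum<false>(b, len, 0);
    }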
 //
 // Kernel dispatch structure definition.
@@ -480,10 +1225,15 @@ SPECIALIZE_QNBIT_BLK_DEQUANT_B_FOR_SGEMM(4, 256)
 const MLAS_SQNBIT_GEMM_DISPATCH MlasSQNBitGemmDispatchNeon = []() {
     MLAS_SQNBIT_GEMM_DISPATCH d;
-    d.Operations[QuantVariant_BitWidth4_BlockSize16] = MlasSQNBitGemmOperation<4, 16, MLAS_SQNBIT_GEMM_KERNEL_NEON>;
-    d.Operations[QuantVariant_BitWidth4_BlockSize32] = MlasSQNBitGemmOperation<4, 32, MLAS_SQNBIT_GEMM_KERNEL_NEON>;
-    d.Operations[QuantVariant_BitWidth4_BlockSize64] = MlasSQNBitGemmOperation<4, 64, MLAS_SQNBIT_GEMM_KERNEL_NEON>;
-    d.Operations[QuantVariant_BitWidth4_BlockSize128] = MlasSQNBitGemmOperation<4, 128, MLAS_SQNBIT_GEMM_KERNEL_NEON>;
-    d.Operations[QuantVariant_BitWidth4_BlockSize256] = MlasSQNBitGemmOperation<4, 256, MLAS_SQNBIT_GEMM_KERNEL_NEON>;
+
+    d.SQ4BitGemmPackQuantBDataSize = SQ4BitGemmPackQuantBDataSize;
+    d.SQ4BitGemmPackQuantBData = SQ4BitGemmPackQuantBData;
+
+    d.SQ4BitGemmM1Kernel_CompFp32 = SQ4BitGemmM1Kernel_CompFp32;
+    d.Q4BitBlkDequantBForSgemm_CompFp32 = Q4BitBlkDequantBForSgemm_CompFp32;
+
+    d.SQ4BitGemmM1Kernel_CompInt8 = SQ4BitGemmM1Kernel_CompInt8;
+    d.QuantizeARow_CompInt8 = QuantizeARow_CompInt8;
+
     return d;
 }();
diff --git a/onnxruntime/core/mlas/lib/wasm_simd/SgemmKernelWasmSimd.cpp b/onnxruntime/core/mlas/lib/wasm_simd/SgemmKernelWasmSimd.cpp
index 955b7c5deee9..43a12b37e4ff 100644
--- a/onnxruntime/core/mlas/lib/wasm_simd/SgemmKernelWasmSimd.cpp
+++ b/onnxruntime/core/mlas/lib/wasm_simd/SgemmKernelWasmSimd.cpp
@@ -171,11 +171,9 @@ Return Value:
         if (k > 0) {
 
             Row0AElements0 = a[0];
-            Row0AElements1 = a[1];
 
             if (ProcessTwoRows) {
                 Row1AElements0 = a[lda];
-                Row1AElements1 = a[lda + 1];
             }
 
             BElements0 = MlasLoadFloat32x4(B + 0);
diff --git a/onnxruntime/core/mlas/lib/x86_64/SoftmaxKernelAvx512F.S b/onnxruntime/core/mlas/lib/x86_64/SoftmaxKernelAvx512F.S
new file mode 100644
index 000000000000..db9728604656
--- /dev/null
+++ b/onnxruntime/core/mlas/lib/x86_64/SoftmaxKernelAvx512F.S
@@ -0,0 
+1,101 @@ +/*++ + +Copyright (c) Microsoft Corporation. All rights reserved. + +Licensed under the MIT License. + +Module Name: + + SoftmaxKernelAvx512F.s + +Abstract: + + This module implements the kernels for the single precision softmax + operation. + + This implementation uses AVX512F instructions. + +--*/ + +#include "asmmacro.h" + + .intel_syntax noprefix + + .text + +/*++ + +Routine Description: + + This routine implements a vectorized kernel to find the maximum value of + the supplied buffer. + +Arguments: + + Input (rdi) - Supplies the input buffer. + + N (rsi) - Supplies the number of elements to process. + +Return Value: + + Returns the maximum value of the supplied buffer. + +--*/ + + FUNCTION_ENTRY MlasReduceMaximumF32KernelAvx512F + + vbroadcastss zmm0,DWORD PTR C_UNDERSCORE(MlasMinimumF32Value)[rip] + test rsi,rsi + jz .LReduceMaximum.ExitKernel + cmp rsi,16 + jb .LReduceMaximum.ProcessRemainingCountBy1 + cmp rsi,64 + jb .LReduceMaximum.ProcessRemainingCountBy16 + vmovaps zmm1,zmm0 + vmovaps zmm2,zmm0 + vmovaps zmm3,zmm0 + +.LReduceMaximum.ProcessRemainingCountBy64: + vmaxps zmm0,zmm0,ZMMWORD PTR [rdi] + vmaxps zmm1,zmm1,ZMMWORD PTR [rdi+16*4] + sub rsi,64 + vmaxps zmm2,zmm2,ZMMWORD PTR [rdi+32*4] + vmaxps zmm3,zmm3,ZMMWORD PTR [rdi+48*4] + add rdi,64*4 # advance input by 64 elements + cmp rsi,64 + jae .LReduceMaximum.ProcessRemainingCountBy64 + vmaxps zmm0,zmm0,zmm1 # reduce to single vector + vmaxps zmm2,zmm2,zmm3 + vmaxps zmm0,zmm0,zmm2 + +.LReduceMaximum.ProcessRemainingCountBy16: + cmp rsi,16 + jb .LReduceMaximum.ProcessRemainingCountLessThan16 + vmaxps zmm0,zmm0,ZMMWORD PTR [rdi] + sub rsi,16 + add rdi,16*4 # advance input by 16 elements + jmp .LReduceMaximum.ProcessRemainingCountBy16 + +.LReduceMaximum.ProcessRemainingCountLessThan16: + vextractf32x8 ymm1,zmm0,1 # reduce to single scalar + vmaxps ymm0,ymm0,ymm1 + vextractf128 xmm1,ymm0,1 + vmaxps xmm0,xmm0,xmm1 + vshufps xmm1,xmm0,xmm0,0xEE + vmaxps xmm0,xmm0,xmm1 + vshufps xmm1,xmm0,xmm0,0x55 + vmaxss xmm0,xmm0,xmm1 + test rsi,rsi + jz .LReduceMaximum.ExitKernel + +.LReduceMaximum.ProcessRemainingCountBy1: + vmaxss xmm0,xmm0,DWORD PTR [rdi] + add rdi,4 # advance input by 1 element + dec esi + jnz .LReduceMaximum.ProcessRemainingCountBy1 + +.LReduceMaximum.ExitKernel: + vzeroupper + ret + + .end diff --git a/onnxruntime/core/mlas/lib/x86_64/jblas/.clang-format b/onnxruntime/core/mlas/lib/x86_64/jblas/.clang-format deleted file mode 100644 index 84b876706161..000000000000 --- a/onnxruntime/core/mlas/lib/x86_64/jblas/.clang-format +++ /dev/null @@ -1,7 +0,0 @@ -Language: Cpp -BasedOnStyle: Google -DerivePointerAlignment: false -ColumnLimit: 120 -SpaceBeforeParens: ControlStatements -SpaceBeforeRangeBasedForLoopColon: true -SortIncludes: false diff --git a/onnxruntime/core/mlas/lib/x86_64/jblas/CMakeLists.txt b/onnxruntime/core/mlas/lib/x86_64/jblas/CMakeLists.txt deleted file mode 100644 index 5d9c5edf45a9..000000000000 --- a/onnxruntime/core/mlas/lib/x86_64/jblas/CMakeLists.txt +++ /dev/null @@ -1,33 +0,0 @@ -cmake_minimum_required(VERSION 3.5) - -project(jblas LANGUAGES CXX VERSION 0.1.0) - -file(GLOB headers ${PROJECT_NAME}/*.h ${PROJECT_NAME}/*.hpp) -file(GLOB xbyak_headers ${PROJECT_NAME}/xbyak/*.h ${PROJECT_NAME}/xbyak/*.hpp) - -add_library(${PROJECT_NAME} INTERFACE) -add_library(${PROJECT_NAME}::${PROJECT_NAME} ALIAS ${PROJECT_NAME}) - -target_include_directories( - ${PROJECT_NAME} INTERFACE - "$" - "$" -) - -if(WIN32) - target_compile_definitions(${PROJECT_NAME} INTERFACE _CRT_SECURE_NO_WARNINGS NOMINMAX) - 
target_compile_options(${PROJECT_NAME} INTERFACE /wd4068 /wd4849 /wd6262 /wd4702 /wd4100) - #4068 ignore unroll and GCC flags - #4849 ignore collapse - #6262 ignore stack too large - #4702 unreachable code(false warning on constexpr condition) - #4100 unreferenced formal parameter - - target_link_options(${PROJECT_NAME} INTERFACE /STACK:3145728) #Stack requires up to L2 cache size -endif(WIN32) - - -set(CMAKE_CXX_STANDARD 17) -set(CMAKE_CXX_STANDARD_REQUIRED ON) - -target_compile_features(${PROJECT_NAME} INTERFACE cxx_std_17) diff --git a/onnxruntime/core/mlas/lib/x86_64/jblas/jblas/jit_base.h b/onnxruntime/core/mlas/lib/x86_64/jblas/jblas/jit_base.h deleted file mode 100644 index 143adb771760..000000000000 --- a/onnxruntime/core/mlas/lib/x86_64/jblas/jblas/jit_base.h +++ /dev/null @@ -1,303 +0,0 @@ -// Copyright (c) 2023 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -#pragma once -#include - -#include -#include -#include "xbyak/xbyak.h" -#include "xbyak/xbyak_util.h" - -#define OFFSET(field) offsetof(params, field) - -namespace jblas { - -namespace xbyak { -class JitBase : protected Xbyak::CodeGenerator { - protected: - JitBase(size_t size = 16 * 1024) : CodeGenerator(size) {} - - void load32(const Xbyak::Reg64& reg, const Xbyak::Address& addr) { - xor_(reg, reg); - mov(reg.cvt32(), addr); - } - - void vreg_push(const Xbyak::Reg64& baseaddr) { -#ifdef _WIN32 - for (int i = 0; i < 10; i++) { - movaps(xword[baseaddr + i * 16], Xbyak::Xmm(6 + i)); - } -#endif - } - - void vreg_pop(const Xbyak::Reg64& baseaddr) { -#ifdef _WIN32 - for (int i = 0; i < 10; i++) { - movaps(Xbyak::Xmm(6 + i), xword[baseaddr + i * 16]); - } -#endif - } - - void padto_le(const Xbyak::Reg64& _src, int padding) { - // _src=_src/padding*padding - if (padding == 1) { - return; - } - for (int i = 1; i < 16; i++) { - if ((1 << i) == padding) { - shr(_src, i); - shl(_src, i); - return; - } - } - assert(0); - } - - void generate_Nbitsmask(const Xbyak::Opmask& _msk, const Xbyak::Reg64& _pos, const Xbyak::Address& _total, - const Xbyak::Reg64& _tmp, const Xbyak::Reg64& _tmp1, int N) { - inLocalLabel(); - lea(_tmp, _total); - sub(_tmp, _pos); - cmp(_tmp, N); - jb(".maskflag"); - cmp(_tmp, 0); - jl(".zeroflag"); - uint64_t allmask = (static_cast(1) << N) - 1; - if (N == 64) { - allmask = static_cast(-1); - } - mov(_tmp, allmask); - kmovq(_msk, _tmp); - jmp(".maskend"); - L(".maskflag"); - mov(_tmp1, 1); - shlx(_tmp1, _tmp1, _tmp); - sub(_tmp1, 1); - kmovq(_msk, _tmp1); - jmp(".maskend"); - L(".zeroflag"); - mov(_tmp1, 0); - kmovq(_msk, _tmp1); - L(".maskend"); - outLocalLabel(); - } - void generate_Nbitsmask(const Xbyak::Opmask& _msk, const Xbyak::Reg64& _pos, const Xbyak::Reg64& _total, - const Xbyak::Reg64& _tmp, const Xbyak::Reg64& _tmp1, int N) { - generate_Nbitsmask(_msk, _pos, ptr[_total], _tmp, _tmp1, N); - } -}; - -class JitAvx : protected JitBase { - protected: - static int constexpr VBits = 256; - static int constexpr VecBytes = VBits / 8; - static int constexpr RegCount = 16; - typedef 
Xbyak::Ymm vreg_t; -}; - -class JitAvx2 : protected JitAvx { - protected: - static int constexpr VBits = 256; - typedef Xbyak::Ymm vreg_t; - void vxor(const vreg_t& x1, const vreg_t& x2, const Xbyak::Operand& op) { vpxor(x1, x2, op); } - - void loadbf16_f32(const Xbyak::Ymm& dst, const Xbyak::Address& addr) { - vpmovzxwd(dst, addr); - vpslld(dst, dst, 16); - } -}; - -class JitAvx512f : protected JitAvx2 { - protected: - static int constexpr VBits = 512; - static int constexpr VecBytes = VBits / 8; - static int constexpr RegCount = 32; - typedef Xbyak::Zmm vreg_t; - - void vxor(const vreg_t& x1, const vreg_t& x2, const Xbyak::Operand& op) { vpxorq(x1, x2, op); } - - void interleave_2rows_4regs(Xbyak::Zmm* src_2regs, Xbyak::Zmm* tmp_2reg) { - vpunpcklwd(tmp_2reg[0], src_2regs[0], src_2regs[1]); - vpunpckhwd(tmp_2reg[1], src_2regs[0], src_2regs[1]); - vshuff32x4(src_2regs[0], tmp_2reg[0], tmp_2reg[1], 0 | (1 << 2) | (0 << 4) | (1 << 6)); - vshuff32x4(src_2regs[0], src_2regs[0], src_2regs[0], 0 | (2 << 2) | (1 << 4) | (3 << 6)); - vshuff32x4(src_2regs[1], tmp_2reg[0], tmp_2reg[1], 2 | (3 << 2) | (2 << 4) | (3 << 6)); - vshuff32x4(src_2regs[1], src_2regs[1], src_2regs[1], 0 | (2 << 2) | (1 << 4) | (3 << 6)); - } - - void transpose16x16_4B(Xbyak::Zmm* src, Xbyak::Zmm* tmp, const int N = 16) { - for (int i = 0; i < 8; ++i) { - vpunpckldq(tmp[2 * i + 0], src[2 * i], src[2 * i + 1]); - vpunpckhdq(tmp[2 * i + 1], src[2 * i], src[2 * i + 1]); - } - - for (int i = 0; i < 4; ++i) { - vpunpcklqdq(src[4 * i + 0], tmp[4 * i + 0], tmp[4 * i + 2]); - vpunpckhqdq(src[4 * i + 1], tmp[4 * i + 0], tmp[4 * i + 2]); - vpunpcklqdq(src[4 * i + 2], tmp[4 * i + 1], tmp[4 * i + 3]); - vpunpckhqdq(src[4 * i + 3], tmp[4 * i + 1], tmp[4 * i + 3]); - } - - for (int i = 0; i < 2; ++i) { - vshufi32x4(tmp[8 * i + 0], src[8 * i + 0], src[8 * i + 4], 0x88); - vshufi32x4(tmp[8 * i + 1], src[8 * i + 1], src[8 * i + 5], 0x88); - vshufi32x4(tmp[8 * i + 2], src[8 * i + 2], src[8 * i + 6], 0x88); - vshufi32x4(tmp[8 * i + 3], src[8 * i + 3], src[8 * i + 7], 0x88); - vshufi32x4(tmp[8 * i + 4], src[8 * i + 0], src[8 * i + 4], 0xdd); - vshufi32x4(tmp[8 * i + 5], src[8 * i + 1], src[8 * i + 5], 0xdd); - vshufi32x4(tmp[8 * i + 6], src[8 * i + 2], src[8 * i + 6], 0xdd); - vshufi32x4(tmp[8 * i + 7], src[8 * i + 3], src[8 * i + 7], 0xdd); - } - - // last step and move out - for (int i = 0; i < N; ++i) { - vshufi32x4(src[i], tmp[i % 8], tmp[8 + i % 8], i < 8 ? 
0x88 : 0xdd); - } - } - - void interleave_4rows_6regs(Xbyak::Zmm* src_4regs, Xbyak::Zmm* tmp_regs, const Xbyak::Opmask* masks) { - vpunpcklbw(tmp_regs[0], src_4regs[0], src_4regs[1]); - vpunpckhbw(tmp_regs[1], src_4regs[0], src_4regs[1]); - vpunpcklbw(tmp_regs[2], src_4regs[2], src_4regs[3]); - vpunpckhbw(tmp_regs[3], src_4regs[2], src_4regs[3]); - - vpunpcklwd(tmp_regs[4], tmp_regs[0], tmp_regs[2]); - vpunpckhwd(tmp_regs[5], tmp_regs[0], tmp_regs[2]); - vpunpcklwd(tmp_regs[0], tmp_regs[1], tmp_regs[3]); - vpunpckhwd(tmp_regs[2], tmp_regs[1], tmp_regs[3]); - vshuff32x4(tmp_regs[1], tmp_regs[4], tmp_regs[0], (4 << 4) | 4); - vshuff32x4(tmp_regs[3], tmp_regs[5], tmp_regs[2], (4 << 4) | 4); - vmovups(src_4regs[0], tmp_regs[1]); - vshuff32x4(src_4regs[0] | masks[0], tmp_regs[3], tmp_regs[3], 0 | (0 << 2) | (0 << 4) | (2 << 6)); - vmovups(src_4regs[1], tmp_regs[3]); - vshuff32x4(src_4regs[1] | masks[1], tmp_regs[1], tmp_regs[1], 1 | (0 << 2) | (3 << 4) | (0 << 6)); - vshuff32x4(tmp_regs[1], tmp_regs[4], tmp_regs[0], (14 << 4) | 14); - vshuff32x4(tmp_regs[3], tmp_regs[5], tmp_regs[2], (14 << 4) | 14); - vmovups(src_4regs[2], tmp_regs[1]); - vshuff32x4(src_4regs[2] | masks[0], tmp_regs[3], tmp_regs[3], 0 | (0 << 2) | (0 << 4) | (2 << 6)); - vmovups(src_4regs[3], tmp_regs[3]); - vshuff32x4(src_4regs[3] | masks[1], tmp_regs[1], tmp_regs[1], 1 | (0 << 2) | (3 << 4) | (0 << 6)); - } - - void cvt_fp32_bf16(const Xbyak::Ymm& _bf16, const Xbyak::Zmm& _fp32) { - vpsrld(_fp32, _fp32, 16); - vpmovdw(_bf16, _fp32); - } - - void loadbf16_f32(const Xbyak::Zmm& dst, const Xbyak::Address& addr) { - vpmovzxwd(dst, addr); - vpslld(dst, dst, 16); - } - - void broadcastbf16_f32(const Xbyak::Zmm& dst, const Xbyak::Reg64& tmp, const Xbyak::Address& addr) { - mov(tmp.cvt16(), addr); - shl(tmp.cvt32(), 16); - vpbroadcastd(dst, tmp.cvt32()); - } - - void store_fp32_bf16(const Xbyak::Zmm& _fp32, const Xbyak::Address& _add) { - auto bf16 = Xbyak::Ymm(_fp32.getIdx()); - cvt_fp32_bf16(bf16, _fp32); - vmovups(_add, bf16); - } -}; - -class JitAvx512_bf16 : protected JitAvx512f {}; - -class JitAvx512_fp16 : protected JitAvx512f {}; - -class JitAvx512vnni : protected JitAvx512f { - protected: - void vpdpbusds_(const Xbyak::Xmm& x1, const Xbyak::Xmm& x2, const Xbyak::Operand& op) { - vpdpbusds(x1, x2, op, Xbyak::EvexEncoding); - } -}; - -class JitAvxvnni : protected JitAvx2 { - protected: - void vpdpbusds_(const Xbyak::Xmm& x1, const Xbyak::Xmm& x2, const Xbyak::Operand& op) { - vpdpbusds(x1, x2, op, Xbyak::VexEncoding); - } -}; - -class JitAmxtile : protected JitAvx512f { - public: - struct alignas(64) tileconfig_t { - uint8_t palette_id; - uint8_t reserved[15]; - uint16_t colb[16]; - uint8_t rows[16]; - }; - static int constexpr TileCount = 8; - - typedef long long (*configure_t)(void*); - - static void generate_config(Xbyak::CodeGenerator* g) { - Xbyak::util::StackFrame st(g, 1, 0, 0); - auto& parambase = st.p[0]; - g->ldtilecfg(g->ptr[parambase]); - } - - static void configure_tiles(tileconfig_t& tc, int TILE_M, int TILE_N, int TILE_K, int elesize, int ANum, int BNum, - int CNum) { - // Filling tile configure structure. Could be done offline. - tc.palette_id = 1; - // Configure C tiles - int t = 0; - for (; t < CNum; ++t) { - tc.rows[t] = static_cast(TILE_M); - tc.colb[t] = static_cast(TILE_N * 4); - } - // Configure A tiles - for (; t < CNum + ANum; ++t) { - tc.rows[t] = static_cast(TILE_M); - tc.colb[t] = static_cast(TILE_K * elesize); - } - // Configure B tile. B effectively has 64 rows and 16 columns. 
- int kpack = 4 / elesize; - for (; t < CNum + ANum + BNum; ++t) { - tc.rows[t] = static_cast(TILE_K / kpack); - tc.colb[t] = static_cast(TILE_N * 4); - } - } -}; - -class JitAmxbf16 : protected JitAmxtile { - protected: - void cvt_fp32_bf16(const Xbyak::Ymm& _bf16, const Xbyak::Zmm& _fp32) { vcvtneps2bf16(_bf16, _fp32); } -}; - -class JitAmxint8 : protected JitAmxtile { - protected: - template - void _tdpb(const Xbyak::Tmm& x1, const Xbyak::Tmm& x2, const Xbyak::Tmm& x3); -}; -template <> -inline void JitAmxint8::_tdpb(const Xbyak::Tmm& x1, const Xbyak::Tmm& x2, const Xbyak::Tmm& x3) { - tdpbssd(x1, x2, x3); -} -template <> -inline void JitAmxint8::_tdpb(const Xbyak::Tmm& x1, const Xbyak::Tmm& x2, const Xbyak::Tmm& x3) { - tdpbsud(x1, x2, x3); -} -template <> -inline void JitAmxint8::_tdpb(const Xbyak::Tmm& x1, const Xbyak::Tmm& x2, const Xbyak::Tmm& x3) { - tdpbusd(x1, x2, x3); -} -template <> -inline void JitAmxint8::_tdpb(const Xbyak::Tmm& x1, const Xbyak::Tmm& x2, const Xbyak::Tmm& x3) { - tdpbuud(x1, x2, x3); -} -} // namespace xbyak -} // namespace jblas diff --git a/onnxruntime/core/mlas/lib/x86_64/jblas/jblas/jit_blas.h b/onnxruntime/core/mlas/lib/x86_64/jblas/jblas/jit_blas.h deleted file mode 100644 index 8ecf3535c17f..000000000000 --- a/onnxruntime/core/mlas/lib/x86_64/jblas/jblas/jit_blas.h +++ /dev/null @@ -1,96 +0,0 @@ -// Copyright (c) 2023 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-#pragma once -#include -enum JBLAS_CODE { - JblasSuccess = 0, - JblasInvalidParam = 1, - JblasInvalidISA = 2, - JblasRuntimeError = 4, - JblasNotSupport = 8, -}; -enum JBLAS_ISA : uint32_t { - JblasNoSIMD = 0, - JblasAVX, - JblasAVX2, - JblasAVX_VNNI, - JblasAVX512F, - JblasAVX512_VNNI, - JblasAMX_BF16, - JblasAMX_INT8, - JblasAVX512_FP16, - JblasAVX512_BF16, -}; -enum class JBLAS_DTYPE : uint32_t { - EleBitsMask = 0xff, - EleBitsUndef = 0, - EleBits4 = 4, - EleBits8 = 8, - EleBits16 = 16, - EleBits32 = 32, - EleBits64 = 64, - TypeMask = 0xff00, - TypeFloat = 0 << 8, - TypeInt = 1 << 8, - SubTypeMask = 0xff0000, - SubType0 = 0 << 16, - SubType1 = 1 << 16, - SubType2 = 2 << 16, - F64 = EleBits64 | TypeFloat, - F32 = EleBits32 | TypeFloat, - F16 = EleBits16 | TypeFloat, - BF16 = EleBits16 | TypeFloat | SubType1, - F8_E4M3 = EleBits8 | TypeFloat, - F8_E5M2 = EleBits8 | TypeFloat | SubType1, - F8_E3M4 = EleBits8 | TypeFloat | SubType2, - S8 = EleBits8 | TypeInt, - U8 = EleBits8 | TypeInt | SubType1, - S4_CLIP = EleBits4 | TypeInt, - S4_FULLRANGE = EleBits4 | TypeInt | SubType1, - F4_E2M1 = EleBits4 | TypeFloat, - F4_BNB = EleBits4 | TypeFloat | SubType1, - F4_NF4 = EleBits4 | TypeFloat | SubType2, - S32 = EleBits32 | TypeInt, - U32 = EleBits32 | TypeInt | SubType1, -}; - -enum JBLAS_LAYOUT { JblasRowMajor = 101, JblasColMajor = 102 }; -enum JBLAS_TRANSPOSE { - JblasNoTrans = 111, - JblasTrans = 112, - JblasConjTrans = 113, -}; -enum JBLAS_ELTWISEOP { - GELU, - SWISH, - TANH, - EXP, - LOW_PRECISION_EXP, - RELU, - LINEAR, -}; - -enum class JBLAS_PROLOGUEB_IDS : uint32_t { - Undef = (uint32_t)-1, - Begin = 0, - NormalBegin = Begin, - WeightPack = NormalBegin, - NormalEnd, - KBlockBegin = NormalEnd, - WeightKBlockS8 = KBlockBegin, - WeightKBlockS4, - WeightKBlockF4, - KBlockEnd, - End, -}; diff --git a/onnxruntime/core/mlas/lib/x86_64/jblas/jblas/jit_blas_device.h b/onnxruntime/core/mlas/lib/x86_64/jblas/jblas/jit_blas_device.h deleted file mode 100644 index 5cac1080bc61..000000000000 --- a/onnxruntime/core/mlas/lib/x86_64/jblas/jblas/jit_blas_device.h +++ /dev/null @@ -1,277 +0,0 @@ -// Copyright (c) 2023 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-#pragma once -#include "jit_blas.h" -#include "xbyak/xbyak_util.h" - -namespace jblas { - -namespace device { - -struct X64_ISA { - int64_t MMX : 1; // 0 - int64_t SSE : 1; // 1 - int64_t SSE2 : 1; // 2 - int64_t SSE3 : 1; // 3 - int64_t SSSE3 : 1; // 4 - int64_t SSE41 : 1; // 5 - int64_t SSE42 : 1; // 6 - int64_t AVX : 1; // 7 - int64_t F16C : 1; // 8 - int64_t FMA : 1; // 9 - int64_t AVX2 : 1; // 10 - int64_t AVX_VNNI : 1; // 11 - int64_t AVX_VNNI_INT8 : 1; // 12 - int64_t AVX_NE_CONVERT : 1; // 13 - int64_t AVX_IFMA : 1; // 14 - int64_t AVX512F : 1; // 15 - int64_t AVX512BW : 1; // 16 - int64_t AVX512CD : 1; // 17 - int64_t AVX512DQ : 1; // 18 - int64_t AVX512ER : 1; // 19 - int64_t AVX512IFMA52 : 1; // 20 - int64_t AVX512PF : 1; // 21 - int64_t AVX512VL : 1; // 22 - int64_t AVX512VPOPCNTDQ : 1; // 23 - int64_t AVX512_4FMAPS : 1; // 24 - int64_t AVX512_4VNNIW : 1; // 25 - int64_t AVX512_BF16 : 1; // 26 - int64_t AVX512_BITALG : 1; // 27 - int64_t AVX512_VBMI : 1; // 28 - int64_t AVX512_VBMI2 : 1; // 29 - int64_t AVX512_VNNI : 1; // 30 - int64_t AVX512_VP2INTERSECT : 1; // 31 - int64_t AVX512_FP16 : 1; // 32 - int64_t AMX_TILE : 1; // 33 - int64_t AMX_BF16 : 1; // 34 - int64_t AMX_INT8 : 1; // 35 - int64_t AMX_FP16 : 1; // 36 - int64_t AMX_COMPLEX : 1; // 37 - int64_t reserved : (64 - 38); -}; - -class AVX2_Default { - public: - static constexpr bool MMX = 1; - static constexpr bool SSE = 1; - static constexpr bool SSE2 = 1; - static constexpr bool SSE3 = 1; - static constexpr bool SSSE3 = 1; - static constexpr bool SSE41 = 1; - static constexpr bool SSE42 = 1; - static constexpr bool AVX = 1; - static constexpr bool F16C = 1; - static constexpr bool FMA = 1; - static constexpr bool AVX2 = 1; - static constexpr bool AVX_VNNI = 0; - static constexpr bool AVX_VNNI_INT8 = 0; - static constexpr bool AVX_NE_CONVERT = 0; - static constexpr bool AVX_IFMA = 0; - static constexpr bool AVX512F = 0; - static constexpr bool AVX512BW = 0; - static constexpr bool AVX512CD = 0; - static constexpr bool AVX512DQ = 0; - static constexpr bool AVX512ER = 0; - static constexpr bool AVX512IFMA52 = 0; - static constexpr bool AVX512PF = 0; - static constexpr bool AVX512VL = 0; - static constexpr bool AVX512VPOPCNTDQ = 0; - static constexpr bool AVX512_4FMAPS = 0; - static constexpr bool AVX512_4VNNIW = 0; - static constexpr bool AVX512_BF16 = 0; - static constexpr bool AVX512_BITALG = 0; - static constexpr bool AVX512_VBMI = 0; - static constexpr bool AVX512_VBMI2 = 0; - static constexpr bool AVX512_VNNI = 0; - static constexpr bool AVX512_VP2INTERSECT = 0; - static constexpr bool AVX512_FP16 = 0; - static constexpr bool AMX_TILE = 0; - static constexpr bool AMX_BF16 = 0; - static constexpr bool AMX_INT8 = 0; - static constexpr bool AMX_FP16 = 0; - static constexpr bool AMX_COMPLEX = 0; -}; - -class AVX512_VNNI_Default { - public: - static constexpr bool MMX = 1; - static constexpr bool SSE = 1; - static constexpr bool SSE2 = 1; - static constexpr bool SSE3 = 1; - static constexpr bool SSSE3 = 1; - static constexpr bool SSE41 = 1; - static constexpr bool SSE42 = 1; - static constexpr bool AVX = 1; - static constexpr bool F16C = 1; - static constexpr bool FMA = 1; - static constexpr bool AVX2 = 1; - static constexpr bool AVX_VNNI = 0; - static constexpr bool AVX_VNNI_INT8 = 0; - static constexpr bool AVX_NE_CONVERT = 0; - static constexpr bool AVX_IFMA = 0; - static constexpr bool AVX512F = 1; - static constexpr bool AVX512BW = 1; - static constexpr bool AVX512CD = 1; - static constexpr bool AVX512DQ = 1; - 
static constexpr bool AVX512ER = 0; - static constexpr bool AVX512IFMA52 = 0; - static constexpr bool AVX512PF = 0; - static constexpr bool AVX512VL = 1; - static constexpr bool AVX512VPOPCNTDQ = 0; - static constexpr bool AVX512_4FMAPS = 0; - static constexpr bool AVX512_4VNNIW = 0; - static constexpr bool AVX512_BF16 = 0; - static constexpr bool AVX512_BITALG = 0; - static constexpr bool AVX512_VBMI = 0; - static constexpr bool AVX512_VBMI2 = 0; - static constexpr bool AVX512_VNNI = 1; - static constexpr bool AVX512_VP2INTERSECT = 0; - static constexpr bool AVX512_FP16 = 0; - static constexpr bool AMX_TILE = 0; - static constexpr bool AMX_BF16 = 0; - static constexpr bool AMX_INT8 = 0; - static constexpr bool AMX_FP16 = 0; - static constexpr bool AMX_COMPLEX = 0; -}; - -class SapphireRapids { - public: - static constexpr bool MMX = 1; - static constexpr bool SSE = 1; - static constexpr bool SSE2 = 1; - static constexpr bool SSE3 = 1; - static constexpr bool SSSE3 = 1; - static constexpr bool SSE41 = 1; - static constexpr bool SSE42 = 1; - static constexpr bool AVX = 1; - static constexpr bool F16C = 1; - static constexpr bool FMA = 1; - static constexpr bool AVX2 = 1; - static constexpr bool AVX_VNNI = 0; - static constexpr bool AVX_VNNI_INT8 = 0; - static constexpr bool AVX_NE_CONVERT = 0; - static constexpr bool AVX_IFMA = 0; - static constexpr bool AVX512F = 1; - static constexpr bool AVX512BW = 1; - static constexpr bool AVX512CD = 1; - static constexpr bool AVX512DQ = 1; - static constexpr bool AVX512ER = 0; - static constexpr bool AVX512IFMA52 = 0; - static constexpr bool AVX512PF = 0; - static constexpr bool AVX512VL = 1; - static constexpr bool AVX512VPOPCNTDQ = 0; - static constexpr bool AVX512_4FMAPS = 0; - static constexpr bool AVX512_4VNNIW = 0; - static constexpr bool AVX512_BF16 = 0; - static constexpr bool AVX512_BITALG = 0; - static constexpr bool AVX512_VBMI = 0; - static constexpr bool AVX512_VBMI2 = 0; - static constexpr bool AVX512_VNNI = 1; - static constexpr bool AVX512_VP2INTERSECT = 0; - static constexpr bool AVX512_FP16 = 0; - static constexpr bool AMX_TILE = 1; - static constexpr bool AMX_BF16 = 1; - static constexpr bool AMX_INT8 = 1; - static constexpr bool AMX_FP16 = 0; - static constexpr bool AMX_COMPLEX = 0; -}; - -template -class isa_base { - public: - static bool constexpr avx = ISA_T >= JblasAVX; - static bool constexpr avx2 = ISA_T >= JblasAVX2; - static bool constexpr avx512f = ISA_T >= JblasAVX512F; - static bool constexpr avx512_vnni = ISA_T >= JblasAVX512_VNNI; - static bool constexpr avx512_fp16 = ISA_T >= JblasAVX512_FP16; - static bool constexpr amx_bf16 = ISA_T >= JblasAMX_BF16; - static bool constexpr amx_int8 = ISA_T >= JblasAMX_INT8; -}; - -class CpuDevice { - public: - inline void setThreads(int _nth) { - if (_nth <= 0) { - numthreads = numcores; - } else { - numthreads = std::min(numcores, _nth); - } - } - inline int getThreads() { return numthreads; } - inline int getCores() { return numcores; } - inline uint32_t getL2CacheSize() { return L2Cache; } - inline uint32_t getL1CacheSize() { return L1Cache; } - inline bool AVX() { return mHasAVX; } - inline bool AVX2() { return mHasAVX2; } - inline bool AVX_VNNI() { return mHasAVX_VNNI; } - inline bool AVX512F() { return mHasAVX512F; } - inline bool AVX512_VNNI() { return mHasAVX512_VNNI; } - inline bool AMX_INT8() { return mHasAMX_INT8; } - inline bool AMX_BF16() { return mHasAMX_BF16; } - inline bool AVX512_BF16() { return mHasAVX512_BF16; } - inline bool AVX512_FP16() { return 
mHasAVX512_FP16; } -#define ADD_FLAG(isa) mHas##isa = _cpu.has(_cpu.t##isa) - CpuDevice() { - static Xbyak::util::Cpu _cpu; - L1Cache = _cpu.getDataCacheSize(0); - L2Cache = _cpu.getDataCacheSize(1); - ADD_FLAG(AVX); - ADD_FLAG(AVX2); - ADD_FLAG(AVX512F); - ADD_FLAG(AVX512_VNNI); - ADD_FLAG(AVX_VNNI); - ADD_FLAG(AMX_BF16); - ADD_FLAG(AMX_INT8); - ADD_FLAG(AVX512_BF16); - ADD_FLAG(AVX512_FP16); - numcores = _cpu.getNumCores(Xbyak::util::IntelCpuTopologyLevel::CoreLevel); - numthreads = numcores; - } - - static CpuDevice* getInstance() { - static CpuDevice instance; - return &instance; - } - - void print() { - printf( - "AVX:%d AVX2:%d AVX512F:%d AVX_VNNI:%d AVX512_VNNI:%d AMX_INT8:%d AMX_BF16:%d AVX512_BF16:%d AVX512_FP16:%d\n", - mHasAVX, mHasAVX2, mHasAVX512F, mHasAVX_VNNI, mHasAVX512_VNNI, mHasAMX_INT8, mHasAMX_BF16, mHasAVX512_BF16, - mHasAVX512_FP16); - } -#undef ADD_FLAG - - protected: - uint32_t L2Cache, L1Cache; - bool mHasAVX2, mHasAVX_VNNI, mHasAVX, mHasAVX512_VNNI, mHasAMX_INT8, mHasAMX_BF16, mHasAVX512F, mHasAVX512_BF16, - mHasAVX512_FP16; - int numcores; - int numthreads; -}; - -#define GetCPUDevice() auto _cd = jblas::device::CpuDevice::getInstance(); - -class CpuBase { - public: - CpuBase() { - GetCPUDevice(); - mL2Cache = _cd->getL2CacheSize(); - mL1Cache = _cd->getL1CacheSize(); - mNumThreads = _cd->getThreads(); - } - size_t mL2Cache, mL1Cache; - int mNumThreads; -}; -} // namespace device -} // namespace jblas diff --git a/onnxruntime/core/mlas/lib/x86_64/jblas/jblas/jit_blas_epilogue.h b/onnxruntime/core/mlas/lib/x86_64/jblas/jblas/jit_blas_epilogue.h deleted file mode 100644 index ceb7a545092d..000000000000 --- a/onnxruntime/core/mlas/lib/x86_64/jblas/jblas/jit_blas_epilogue.h +++ /dev/null @@ -1,329 +0,0 @@ -// Copyright (c) 2023 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -#pragma once -#include - -#include "jit_base.h" -#include "jit_blas.h" -#include "jit_blas_utils.h" -#include "kernel_wrapper.h" - -namespace jblas { -namespace epilogue { -namespace gemm { - -template -class AccumulatorWriteBack { - public: - using SType = _SRC_T; - using DType = _DST_T; - struct Param { - DType* C; - int ldc; - void* elt_const_v; - }; - - template - JBLAS_CODE forward(const _SRC_T* cacheptr, const int cachestep, const int M_offset, const int N_offset, const int M, - const int N, const Param& _param, void* tmpcache, size_t cachesize, Eltops... 
ops) { - auto COffset = M_offset * _param.ldc + N_offset; - auto cptr = _param.C + COffset; - bool constexpr Valid = !std::is_same::value || std::is_same::value; - static_assert(Valid, "fp32 to bf16 conversion only."); - if constexpr (std::is_same::value) { - return kernel::wrapper::Memcpy2DFp32CvtBf16::template forward( - const_cast<_SRC_T*>(cacheptr), cptr, M, N, cachestep * sizeof(SType), _param.ldc * sizeof(DType), false); - } else if constexpr (std::is_same, std::tuple>::value) { - return kernel::wrapper::Memcpy2DFp16CvtFp32::template forward( - const_cast<_SRC_T*>(cacheptr), cptr, M, N, cachestep * sizeof(SType), _param.ldc * sizeof(DType), false); - } else if constexpr (sizeof(SType) == sizeof(DType)) { - return kernel::wrapper::Memcpy2D::template forward(cacheptr, cptr, M, N, cachestep, - _param.ldc, _param.elt_const_v, ops...); - } else { - assert(false); - } - } -}; - -template -class CustomAccumulatorWriteBackWithEltop { - public: - struct Param { - _DST_T* C; - int ldc; - void* elt_const_v; - }; - JBLAS_CODE forward(const _SRC_T* cacheptr, const int cachestep, const int M_offset, const int N_offset, const int M, - const int N, const Param& _param, void* tmpcache, size_t cachesize) { - auto COffset = M_offset * _param.ldc + N_offset; - auto cptr = _param.C + COffset; - if constexpr (std::is_same<_SRC_T, float>::value && std::is_same<_DST_T, float>::value) { - return kernel::wrapper::Memcpy2D::template forward1(cacheptr, cptr, M, N, cachestep, - _param.ldc, _param.elt_const_v); - } else { - assert(false); - } - } -}; -template -using AccumulatorWriteBackFp32 = AccumulatorWriteBack; -template -using AccumulatorWriteBackInt32 = AccumulatorWriteBack; -template -using AccumulatorWriteBackBf16 = AccumulatorWriteBack; -template -using AccumulatorWriteBackFp16 = AccumulatorWriteBack; -template -using AccumulatorWriteBackFp16Fp32 = AccumulatorWriteBack; -template -using AccumulatorWriteBackFp32Bf16 = AccumulatorWriteBack; - -template -using AccumulatorWriteBackWithGeluFp32 = CustomAccumulatorWriteBackWithEltop; - -template -using AccumulatorWriteBackWithSwishFp32 = CustomAccumulatorWriteBackWithEltop; - -template -class AlphaBetaProcessFp32 { - public: - struct Param { - float *C, *D; - int ldc, ldd; - float alpha, beta; - }; - - JBLAS_CODE forward(const float* cacheptr, const int cachestep, const int M_offset, const int N_offset, const int M, - const int N, const Param& _param, void* tmpcache, size_t cachesize) { - auto DOffset = M_offset * _param.ldd + N_offset; - auto COffset = M_offset * _param.ldc + N_offset; - auto cptr = _param.C + COffset; - auto dptr = _param.D + DOffset; - return kernel::wrapper::AlphaBetaF32F32::template forward(_param.alpha, cacheptr, cachestep, _param.beta, - dptr, _param.ldd, cptr, _param.ldc, M, N); - } -}; - -template -class CompFp32BlockEpilogue { - public: - struct Param { - void* scales; - JBLAS_DTYPE scaledtype; - int ldsb; - int8_t* zps = nullptr; - float* reduce = nullptr; - int ldra; - }; - JBLAS_CODE forward(const float* srcptr, float* dstptr, const int cachestep, const int M_offset, const int N_offset, - const int K_offset, const int M, const int N, const Param& _param, void* tmpcache, - size_t cachesize) { - auto ret = JblasNotSupport; - if (_param.scaledtype == JBLAS_DTYPE::F32) { - ret = kernel::wrapper::CompFp32BlockScale::template forward( - reinterpret_cast(_param.scales) + K_offset * _param.ldsb + N_offset, srcptr, cachestep, dstptr, - cachestep, M, N); - assert(ret == JblasSuccess); - if (_param.zps != nullptr) { - ret = 
kernel::wrapper::RemoveZeroPointBias::forward_wei( - dstptr, cachestep, M, N, _param.zps + K_offset * _param.ldsb + N_offset, - reinterpret_cast(_param.scales) + K_offset * _param.ldsb + N_offset, _param.ldra, - _param.reduce + M_offset * _param.ldra + K_offset); - } - assert(ret == JblasSuccess); - return ret; - } else if (_param.scaledtype == JBLAS_DTYPE::BF16) { - ret = kernel::wrapper::CompFp32BlockScale::template forward( - reinterpret_cast(_param.scales) + K_offset * _param.ldsb + N_offset, srcptr, cachestep, dstptr, - cachestep, M, N); - assert(_param.zps == nullptr); - assert(ret == JblasSuccess); - return ret; - } - return JblasNotSupport; - } -}; - -template -class DequantInt32ToFp32 { - public: - struct Param { - float* C; - int ldc; - int ldsa; - float* scalesA; - float* scalesB; - }; - JBLAS_CODE forward(const int32_t* cacheptr, const int cachestep, const int M_offset, const int N_offset, const int M, - const int N, const Param& _param, void* tmpcache, size_t cachesize) { - auto COffset = M_offset * _param.ldc + N_offset; - auto cptr = _param.C + COffset; - return kernel::wrapper::DequanS32Fp32::template forward(cacheptr, cachestep, cptr, _param.ldc, M, N, - _param.scalesA + M_offset * _param.ldsa, _param.ldsa, - _param.scalesB + N_offset); - } -}; - -template -class CompInt8BlockEpilogue { - public: - struct Param { - void* scalesB; - JBLAS_DTYPE scaleBdtype; - int ldsb; - float* scalesA; - int ldsa; - // optional if A asym - uint8_t* zpA = nullptr; - void* reduceB = nullptr; - JBLAS_DTYPE reduceBdtype = JBLAS_DTYPE::F32; - // optional if B asym - int8_t* zpB = nullptr; - float* reduceA = nullptr; - int K = 1; - }; - JBLAS_CODE forward(const int32_t* srcptr, float* dstptr, const int cachestep, const int M_offset, const int N_offset, - const int K_offset, const int M, const int N, const Param& _param, void* tmpcache, - size_t cachesize) { - JBLAS_CODE ret = JblasNotSupport; - float* scab = nullptr; - size_t ScaleBTmpSize = N * sizeof(float); - size_t ReduceBTmpSize = N * sizeof(float); - assert(cachesize >= (ScaleBTmpSize + ReduceBTmpSize)); - if (_param.scaleBdtype == JBLAS_DTYPE::BF16) { - auto scache = reinterpret_cast(tmpcache); - ret = kernel::wrapper::Memcpy2DBf16CvtFp32::template forward( - reinterpret_cast(_param.scalesB) + N_offset + K_offset * _param.ldsb, scache, 1, N, N, N, - false); - assert(ret == JblasSuccess); - scab = scache; - } else if (_param.scaleBdtype == JBLAS_DTYPE::F32) { - scab = reinterpret_cast(_param.scalesB) + N_offset + K_offset * _param.ldsb; - } - float* redb = nullptr; - if (_param.reduceB) { - if (_param.reduceBdtype == JBLAS_DTYPE::BF16) { - auto rcache = reinterpret_cast(reinterpret_cast(tmpcache) + ScaleBTmpSize); - ret = kernel::wrapper::Memcpy2DBf16CvtFp32::template forward( - reinterpret_cast(_param.reduceB) + N_offset + K_offset * _param.ldsb, rcache, 1, N, N, N, - false); - assert(ret == JblasSuccess); - redb = rcache; - } else if (_param.reduceBdtype == JBLAS_DTYPE::F32) { - redb = reinterpret_cast(_param.reduceB) + N_offset + K_offset * _param.ldsb; - } - } - ret = kernel::wrapper::DequanS32Fp32::template forward( - srcptr, cachestep, reinterpret_cast(const_cast(srcptr)), cachestep, M, N, - _param.scalesA + M_offset * _param.ldsa + K_offset, _param.ldsa, scab); - assert(ret == JblasSuccess); - ret = kernel::wrapper::AccumulateFp32::template forward(reinterpret_cast(srcptr), cachestep, - dstptr, cachestep, M, N); - assert(ret == JblasSuccess); - - if (_param.zpA == nullptr) { - if (_param.zpB == nullptr) { - return ret; - } else { - 
ret = kernel::wrapper::RemoveZeroPointBias::template forward_wei( - dstptr, cachestep, M, N, _param.zpB + N_offset + K_offset * _param.ldsb, scab, _param.ldsa, - _param.reduceA + M_offset * _param.ldsa + K_offset); - } - } else { - if (_param.zpB == nullptr) { - ret = kernel::wrapper::RemoveZeroPointBias::template forward_act( - dstptr, cachestep, M, N, _param.zpA + M_offset * _param.ldsa + K_offset, - _param.scalesA + M_offset * _param.ldsa + K_offset, _param.ldsa, redb); - } else { - ret = kernel::wrapper::RemoveZeroPointBias::template forward_both( - dstptr, cachestep, M, N, _param.zpA + M_offset * _param.ldsa + K_offset, - _param.zpB + N_offset + K_offset * _param.ldsb, _param.scalesA + M_offset * _param.ldsa + K_offset, scab, - _param.ldsa, _param.K, _param.reduceA + M_offset * _param.ldsa + K_offset, redb); - } - } - return ret; - } -}; - -template -class ZpDequantInt32ToFp32 { - public: - struct Param { - // necessary - float* C; - int ldc; - int ldsa; - float* scalesA; - float* scalesB; - // optional if A asym - uint8_t* zpA = nullptr; - float* reduceB = nullptr; - // optional if B asym - int8_t* zpB = nullptr; - float* reduceA = nullptr; - int K = 1; - }; - JBLAS_CODE forward(const int32_t* cacheptr, const int cachestep, const int M_offset, const int N_offset, const int M, - const int N, const Param& _param, void* tmpcache, size_t cachesize) { - auto COffset = M_offset * _param.ldc + N_offset; - auto cptr = _param.C + COffset; - auto ret = kernel::wrapper::DequanS32Fp32::template forward(cacheptr, cachestep, cptr, _param.ldc, M, N, - _param.scalesA + M_offset * _param.ldsa, - _param.ldsa, _param.scalesB + N_offset); - if (ret != JblasSuccess) { - return ret; - } - if (_param.zpA == nullptr && _param.zpB == nullptr) { - return ret; - } else if (_param.zpA != nullptr && _param.zpB == nullptr) { - ret = kernel::wrapper::RemoveZeroPointBias::template forward_act( - cptr, _param.ldc, M, N, _param.zpA + M_offset * _param.ldsa, _param.scalesA + M_offset * _param.ldsa, - _param.ldsa, _param.reduceB + N_offset); - } else if (_param.zpA == nullptr && _param.zpB != nullptr) { - ret = kernel::wrapper::RemoveZeroPointBias::template forward_wei( - cptr, _param.ldc, M, N, _param.zpB + N_offset, _param.scalesB + N_offset, _param.ldsa, - _param.reduceA + M_offset * _param.ldsa); - } else { - ret = kernel::wrapper::RemoveZeroPointBias::template forward_both( - cptr, _param.ldc, M, N, _param.zpA + M_offset * _param.ldsa, _param.zpB + N_offset, - _param.scalesA + M_offset * _param.ldsa, _param.scalesB + N_offset, _param.ldsa, _param.K, - _param.reduceA + M_offset * _param.ldsa, _param.reduceB + N_offset); - } - return ret; - } -}; - -template -class AlphaBetaProcessS32U8 { - public: - struct Param { - uint8_t* C; - int ldc; - float alpha; - float scaleAcc, scaleC; - int zpC; - }; - - JBLAS_CODE forward(const int32_t* cacheptr, const int cachestep, const int M_offset, const int N_offset, const int M, - const int N, const Param& _param, void* tmpcache, size_t cachesize) { - auto COffset = M_offset * _param.ldc + N_offset; - auto cptr = _param.C + COffset; - return kernel::wrapper::QuanOutS32U32::template forward(_param.alpha, cacheptr, cachestep, cptr, _param.ldc, - M, N, _param.scaleAcc, _param.scaleC, _param.zpC); - } -}; - -} // namespace gemm -} // namespace epilogue -} // namespace jblas diff --git a/onnxruntime/core/mlas/lib/x86_64/jblas/jblas/jit_blas_gemm.h b/onnxruntime/core/mlas/lib/x86_64/jblas/jblas/jit_blas_gemm.h deleted file mode 100644 index 364da9223940..000000000000 --- 
a/onnxruntime/core/mlas/lib/x86_64/jblas/jblas/jit_blas_gemm.h +++ /dev/null @@ -1,2699 +0,0 @@ -// Copyright (c) 2023 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -#pragma once -#include - -#include "jit_blas_utils.h" -#include "jit_base.h" - -namespace jblas { -namespace gemm { -enum class CompType : uint32_t { - COMP_FP32 = 0, - COMP_BF16_FP32 = 1, - COMP_FP16_FP16 = 2, - COMP_INT_START = 3, - COMP_INT8_US_INT32 = COMP_INT_START, - COMP_INT8_UU_INT32 = 4, - COMP_INT8_SS_INT32 = 5, - COMP_INT8_SU_INT32 = 6, - COMP_INT16_SS_INT32 = 7, - COMP_INT8_US_FP32 = 8, - COMP_INT8_UU_FP32 = 9, - COMP_INT8_SS_FP32 = 10, - COMP_INT8_SU_FP32 = 11, -}; - -class CoreAttr { - public: - // INT32=LSB|**8bits:NTile**||**8bits:PackRow**||**8bits:CompType**||**8bits:Reserve**| - static uint32_t constexpr NTILE_MASK = 0xff, NTILE_SHIFT = 0, PACKROW_MASK = 0xff00, PACKROW_SHIFT = 8, - COMP_MASK = 0xff0000, COMP_SHIFT = 16, ISA_MASK = 0xff000000, ISA_SHIFT = 24; - - static inline uint32_t get_mask_val(uint32_t raw, uint32_t mask, uint32_t shift) { return (raw & mask) >> shift; } - static constexpr uint32_t make_core_id(uint32_t NTile, uint32_t PackRow, uint32_t CompType, uint32_t ISA) { - return (NTile << NTILE_SHIFT) | (PackRow << PACKROW_SHIFT) | (CompType << COMP_SHIFT) | (ISA << ISA_SHIFT); - } - - static void parse_id(uint32_t id, uint32_t* vals) { - vals[0] = get_mask_val(id, NTILE_MASK, NTILE_SHIFT); - vals[1] = get_mask_val(id, PACKROW_MASK, PACKROW_SHIFT); - vals[2] = get_mask_val(id, COMP_MASK, COMP_SHIFT); - vals[3] = get_mask_val(id, ISA_MASK, ISA_SHIFT); - } - - static const char* to_str(uint32_t id) { - static char tmp[128]; - uint32_t vals[4]; - parse_id(id, vals); - sprintf(tmp, "N%d_PACK%d_COMP%d_ISA%d", vals[0], vals[1], vals[2], vals[3]); - return tmp; - } - - static inline size_t get_bsize(uint32_t id) { - auto packrow = get_mask_val(id, PACKROW_MASK, PACKROW_SHIFT); - return size_t(4 / packrow); - } -}; - -namespace code { - -template -class Avx2N8P1 : protected jblas::xbyak::JitAvx2 { - public: - static int constexpr RegLen = 8, PackRow = 1; - static_assert(_NTILE % RegLen == 0); - static int constexpr NRegs = _NTILE / RegLen; - static int constexpr MRegs = _MTILE == 0 ? 
(RegCount - 1) / NRegs : _MTILE; - static_assert(NRegs * MRegs <= RegCount - 1); - static int constexpr NTILE = RegLen * NRegs, MTILE = MRegs, KTILE = 1; - static int constexpr KUNROLL = 2; - static uint32_t constexpr ISA = (uint32_t)JBLAS_ISA::JblasAVX2; - static uint32_t constexpr COMPUTE = (uint32_t)CompType::COMP_FP32; - typedef float AType; - typedef float BType; - typedef float CType; - - struct params { - AType* matA; - int astride; - BType* matB; - int bstride; - CType* matC; - int cstride; - int k; - int n; - int init; - }; - typedef long long (*func_t)(params*); - - int CRegCount = 0, BRegCount = 0, ARegCount = 0, TmpRegCount = 0; - int CReg = 0, BReg = 0, AReg = 0, TmpReg = 0; - static int constexpr BKStepSize = KTILE * NTILE * sizeof(BType); - static int constexpr AKStepSize = KTILE * sizeof(AType); - - void generate_code(int _mtile) { - assign_regs(); - reset(); - generate_mtile(_mtile); - ready(); - mKernel = getCode(); - } - func_t mKernel = nullptr; - - protected: - Xbyak::Reg64 parambase; - Xbyak::Reg64 reg_matAptr; - Xbyak::Reg64 reg_matBptr; - Xbyak::Reg64 reg_matCptr; - Xbyak::Reg64 reg_ksize; - Xbyak::Reg64 reg_nsize; - Xbyak::Reg64 reg_cstride; - Xbyak::Reg64 reg_astride; - Xbyak::Reg64 reg_iterk; - Xbyak::Reg64 reg_itern; - Xbyak::Reg64 reg_tmp; - Xbyak::Reg64 reg_tmp1; - Xbyak::Reg64 reg_tmp2; - Xbyak::Reg64 reg_ret = rax; - Xbyak::Opmask msk_wr = k1; - - void assign_regs() { - CRegCount = MRegs * NRegs; - ARegCount = 1; - BRegCount = RegCount - ARegCount - CRegCount; - if (BRegCount < NRegs) { - BRegCount = 0; - ARegCount = BRegCount + 1; - } - if (BRegCount > NRegs) { - BRegCount = NRegs; - } - CReg = 0; - BReg = CReg + CRegCount; - AReg = BReg + BRegCount; - TmpReg = AReg + ARegCount; - assert(TmpReg <= RegCount); - TmpRegCount = RegCount - TmpReg; - } - - void generate_mtile(int _mtile) { - inLocalLabel(); // use local label for multiple instance - Xbyak::util::StackFrame st(this, 1, 10, 16 * 10); - parambase = st.p[0]; - reg_matAptr = st.t[0]; - reg_matBptr = st.t[1]; - reg_matCptr = st.t[0]; - reg_ksize = st.t[2]; - reg_astride = st.t[3]; - reg_cstride = st.t[3]; - reg_iterk = st.t[4]; - reg_tmp = st.t[5]; - reg_tmp1 = st.t[6]; - reg_tmp2 = st.t[7]; - reg_nsize = st.t[8]; - reg_itern = st.t[9]; - reg_ret = rax; - - vreg_push(rsp); - - load32(reg_ksize, ptr[parambase + OFFSET(k)]); - load32(reg_nsize, ptr[parambase + OFFSET(n)]); - xor_(reg_itern, reg_itern); - L(".nloop"); - init_regs(_mtile); - mov(reg_matAptr, ptr[parambase + OFFSET(matA)]); - load32(reg_astride, ptr[parambase + OFFSET(astride)]); - mov(reg_matBptr, ptr[parambase + OFFSET(matB)]); - load32(reg_tmp, ptr[parambase + OFFSET(bstride)]); - imul(reg_tmp, reg_itern); - lea(reg_matBptr, ptr[reg_matBptr + reg_tmp]); - xor_(reg_iterk, reg_iterk); - generate_kloop(_mtile); - write_back(_mtile); - add(reg_itern, NTILE); - cmp(reg_itern, reg_nsize); - jb(".nloop"); - mov(reg_ret, 0); - vreg_pop(rsp); - - outLocalLabel(); // end of local label - } - - void generate_kloop(int _mtile) { - inLocalLabel(); - mov(reg_tmp, reg_ksize); - padto_le(reg_tmp, KUNROLL * KTILE); - cmp(reg_tmp, 0); - jz(".kloop", T_NEAR); - L(".unkloop"); - generate_fma(_mtile, KUNROLL); - add(reg_matAptr, KUNROLL * AKStepSize); - add(reg_matBptr, KUNROLL * BKStepSize); - add(reg_iterk, KUNROLL * KTILE); - cmp(reg_iterk, reg_tmp); // k iteration variable - jb(".unkloop"); - cmp(reg_tmp, reg_ksize); - jge(".kend", T_NEAR); - L(".kloop"); - generate_fma(_mtile, 1); - add(reg_matAptr, 1 * AKStepSize); - add(reg_matBptr, 1 * BKStepSize); - 
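// Worked register budget for this AVX2 generator (assuming RegCount == 16 ymm):
//   _NTILE = 24, _MTILE = 4 -> NRegs = 3, CRegCount = 12,
//   BRegCount = 16 - 1 - 12 = 3 == NRegs, so the B row stays cached in registers.
// With the default _MTILE = 0, MRegs = (16 - 1) / 3 = 5, CRegCount = 15, and
// BRegCount collapses to 0: B is then consumed as a memory operand in the FMAs.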
add(reg_iterk, 1 * KTILE); - cmp(reg_iterk, reg_ksize); // k iteration variable - jb(".kloop"); - L(".kend"); - outLocalLabel(); - } - - void generate_fma(int _mtile, int _ktile) { - for (int kk = 0; kk < _ktile; kk++) { - lea(reg_tmp1, ptr[reg_matAptr + kk * AKStepSize]); - if (BRegCount == NRegs) { - for (int i = 0; i < NRegs; i++) { - vmovups(vreg_t(BReg + i), ptr[reg_matBptr + kk * BKStepSize + i * VecBytes]); - } - for (int mm = 0; mm < _mtile; mm++) { - vbroadcastss(vreg_t(AReg), ptr[reg_tmp1]); - add(reg_tmp1, reg_astride); - for (int i = 0; i < NRegs; i++) { - vfmadd231ps(vreg_t(CReg + mm * NRegs + i), vreg_t(AReg), vreg_t(BReg + i)); - } - } - } else if (BRegCount == 0) { - for (int mm = 0; mm < _mtile; mm += ARegCount) { - int mm_re = utils::remainsize(mm, _mtile, ARegCount); - for (int imm = 0; imm < mm_re; imm++) { - vbroadcastss(vreg_t(AReg + imm), ptr[reg_tmp1]); - add(reg_tmp1, reg_astride); - for (int i = 0; i < NRegs; i++) { - vfmadd231ps(vreg_t(CReg + mm * NRegs + i), vreg_t(AReg + imm), - ptr[reg_matBptr + kk * BKStepSize + i * VecBytes]); - } - } - } - } else { - assert(0); - } - } - } - - void init_regs(int _mtile) { - inLocalLabel(); - load32(reg_tmp, ptr[parambase + OFFSET(init)]); - cmp(reg_tmp, 0); - je(".read", T_NEAR); - for (int i = 0; i < _mtile; i++) { - for (int j = 0; j < NRegs; j++) { - vxor(vreg_t(CReg + i * NRegs + j), vreg_t(CReg + i * NRegs + j), vreg_t(CReg + i * NRegs + j)); - } - } - jmp(".end", T_NEAR); - L(".read"); - mov(reg_matCptr, ptr[parambase + OFFSET(matC)]); - lea(reg_matCptr, ptr[reg_matCptr + reg_itern * sizeof(CType)]); - load32(reg_cstride, ptr[parambase + OFFSET(cstride)]); - for (int i = 0; i < _mtile; i++) { - for (int j = 0; j < NRegs; j++) { - vmovups(vreg_t(CReg + i * NRegs + j), ptr[reg_matCptr + j * VecBytes]); - } - add(reg_matCptr, reg_cstride); - } - L(".end"); - outLocalLabel(); - } - - void write_back(int _mtile) { - inLocalLabel(); - mov(reg_matCptr, ptr[parambase + OFFSET(matC)]); - load32(reg_cstride, ptr[parambase + OFFSET(cstride)]); - lea(reg_matCptr, ptr[reg_matCptr + reg_itern * sizeof(CType)]); - for (int i = 0; i < _mtile; i++) { - for (int j = 0; j < NRegs; j++) { - vmovups(ptr[reg_matCptr + j * VecBytes], vreg_t(CReg + i * NRegs + j)); - } - add(reg_matCptr, reg_cstride); - } - outLocalLabel(); - } -}; - -template -class Avx512fN16P1 : protected jblas::xbyak::JitAvx512f { - public: - static int constexpr RegLen = 16, PackRow = 1; - static_assert(_NTILE % RegLen == 0); - static int constexpr NRegs = _NTILE / RegLen; - static int constexpr MRegs = _MTILE == 0 ? 
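// Shape of the loop emitted by generate_kloop(), as a scalar sketch (padto_le()
// presumably rounds its operand down to a multiple of KUNROLL * KTILE):
//   int k_main = k - k % (KUNROLL * KTILE);
//   int ik = 0;
//   for (; ik < k_main; ik += KUNROLL * KTILE) fma_step(ik, KUNROLL);  // ".unkloop"
//   for (; ik < k; ik += KTILE)                fma_step(ik, 1);        // ".kloop" tail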
(RegCount - 1) / NRegs : _MTILE; - static_assert(NRegs * MRegs <= RegCount - 1); - static int constexpr NTILE = RegLen * NRegs, MTILE = MRegs, KTILE = 1; - static int constexpr KUNROLL = 2; - static uint32_t constexpr ISA = (uint32_t)JBLAS_ISA::JblasAVX512F; - static uint32_t constexpr COMPUTE = (uint32_t)CompType::COMP_FP32; - typedef float AType; - typedef float BType; - typedef float CType; - - struct params { - AType* matA; - int astride; - BType* matB; - int bstride; - CType* matC; - int cstride; - int k; - int n; - int init; - }; - typedef long long (*func_t)(params*); - - int CRegCount = 0, BRegCount = 0, ARegCount = 0, TmpRegCount = 0; - int CReg = 0, BReg = 0, AReg = 0, TmpReg = 0; - static int constexpr BKStepSize = KTILE * NTILE * sizeof(BType); - static int constexpr AKStepSize = KTILE * sizeof(AType); - - void generate_code(int _mtile) { - assign_regs(); - reset(); - generate_mtile(_mtile); - ready(); - mKernel = getCode(); - } - func_t mKernel = nullptr; - - protected: - Xbyak::Reg64 parambase; - Xbyak::Reg64 reg_matAptr; - Xbyak::Reg64 reg_matBptr; - Xbyak::Reg64 reg_matCptr; - Xbyak::Reg64 reg_ksize; - Xbyak::Reg64 reg_nsize; - Xbyak::Reg64 reg_cstride; - Xbyak::Reg64 reg_astride; - Xbyak::Reg64 reg_iterk; - Xbyak::Reg64 reg_itern; - Xbyak::Reg64 reg_tmp; - Xbyak::Reg64 reg_tmp1; - Xbyak::Reg64 reg_tmp2; - Xbyak::Reg64 reg_ret = rax; - Xbyak::Opmask msk_wr = k1; - - void assign_regs() { - CRegCount = MRegs * NRegs; - ARegCount = 1; - BRegCount = RegCount - ARegCount - CRegCount; - if (BRegCount < NRegs) { - BRegCount = 0; - ARegCount = BRegCount + 1; - } - if (BRegCount > NRegs) { - BRegCount = NRegs; - } - CReg = 0; - BReg = CReg + CRegCount; - AReg = BReg + BRegCount; - TmpReg = AReg + ARegCount; - assert(TmpReg <= RegCount); - TmpRegCount = RegCount - TmpReg; - } - - void generate_mtile(int _mtile) { - inLocalLabel(); // use local label for multiple instance - Xbyak::util::StackFrame st(this, 1, 10, 16 * 10); - parambase = st.p[0]; - reg_matAptr = st.t[0]; - reg_matBptr = st.t[1]; - reg_matCptr = st.t[0]; - reg_ksize = st.t[2]; - reg_astride = st.t[3]; - reg_cstride = st.t[3]; - reg_iterk = st.t[4]; - reg_tmp = st.t[5]; - reg_tmp1 = st.t[6]; - reg_tmp2 = st.t[7]; - reg_nsize = st.t[8]; - reg_itern = st.t[9]; - reg_ret = rax; - - vreg_push(rsp); - - load32(reg_ksize, ptr[parambase + OFFSET(k)]); - load32(reg_nsize, ptr[parambase + OFFSET(n)]); - xor_(reg_itern, reg_itern); - L(".nloop"); - init_regs(_mtile); - mov(reg_matAptr, ptr[parambase + OFFSET(matA)]); - load32(reg_astride, ptr[parambase + OFFSET(astride)]); - mov(reg_matBptr, ptr[parambase + OFFSET(matB)]); - load32(reg_tmp, ptr[parambase + OFFSET(bstride)]); - imul(reg_tmp, reg_itern); - lea(reg_matBptr, ptr[reg_matBptr + reg_tmp]); - xor_(reg_iterk, reg_iterk); - generate_kloop(_mtile); - write_back(_mtile); - add(reg_itern, NTILE); - cmp(reg_itern, reg_nsize); - jb(".nloop"); - mov(reg_ret, 0); - vreg_pop(rsp); - - outLocalLabel(); // end of local label - } - - void generate_kloop(int _mtile) { - inLocalLabel(); - mov(reg_tmp, reg_ksize); - padto_le(reg_tmp, KUNROLL * KTILE); - cmp(reg_tmp, 0); - jz(".kloop", T_NEAR); - L(".unkloop"); - generate_fma(_mtile, KUNROLL); - add(reg_matAptr, KUNROLL * AKStepSize); - add(reg_matBptr, KUNROLL * BKStepSize); - add(reg_iterk, KUNROLL * KTILE); - cmp(reg_iterk, reg_tmp); // k iteration variable - jb(".unkloop"); - cmp(reg_tmp, reg_ksize); - jge(".kend", T_NEAR); - L(".kloop"); - generate_fma(_mtile, 1); - add(reg_matAptr, 1 * AKStepSize); - add(reg_matBptr, 1 * 
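// Reference semantics of one generate_fma() step of these fp32 kernels, per k
// (vbroadcastss splats A[m][k]; vfmadd231ps applies it across the cached B row):
//   for (int m = 0; m < MTILE; ++m)
//     for (int n = 0; n < NTILE; ++n)
//       C[m][n] += A[m][k] * B[k][n];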
BKStepSize); - add(reg_iterk, 1 * KTILE); - cmp(reg_iterk, reg_ksize); // k iteration variable - jb(".kloop"); - L(".kend"); - outLocalLabel(); - } - - void generate_fma(int _mtile, int _ktile) { - for (int kk = 0; kk < _ktile; kk++) { - lea(reg_tmp1, ptr[reg_matAptr + kk * AKStepSize]); - if (BRegCount == NRegs) { - for (int i = 0; i < NRegs; i++) { - vmovups(vreg_t(BReg + i), ptr[reg_matBptr + kk * BKStepSize + i * VecBytes]); - } - for (int mm = 0; mm < _mtile; mm++) { - vbroadcastss(vreg_t(AReg), ptr[reg_tmp1]); - add(reg_tmp1, reg_astride); - for (int i = 0; i < NRegs; i++) { - vfmadd231ps(vreg_t(CReg + mm * NRegs + i), vreg_t(AReg), vreg_t(BReg + i)); - } - } - } else if (BRegCount == 0) { - for (int mm = 0; mm < _mtile; mm += ARegCount) { - int mm_re = utils::remainsize(mm, _mtile, ARegCount); - for (int imm = 0; imm < mm_re; imm++) { - vbroadcastss(vreg_t(AReg + imm), ptr[reg_tmp1]); - add(reg_tmp1, reg_astride); - for (int i = 0; i < NRegs; i++) { - vfmadd231ps(vreg_t(CReg + mm * NRegs + i), vreg_t(AReg + imm), - ptr[reg_matBptr + kk * BKStepSize + i * VecBytes]); - } - } - } - } else { - assert(0); - } - } - } - - void init_regs(int _mtile) { - inLocalLabel(); - load32(reg_tmp, ptr[parambase + OFFSET(init)]); - cmp(reg_tmp, 0); - je(".read", T_NEAR); - for (int i = 0; i < _mtile; i++) { - for (int j = 0; j < NRegs; j++) { - vxor(vreg_t(CReg + i * NRegs + j), vreg_t(CReg + i * NRegs + j), vreg_t(CReg + i * NRegs + j)); - } - } - jmp(".end", T_NEAR); - L(".read"); - mov(reg_matCptr, ptr[parambase + OFFSET(matC)]); - lea(reg_matCptr, ptr[reg_matCptr + reg_itern * sizeof(CType)]); - load32(reg_cstride, ptr[parambase + OFFSET(cstride)]); - for (int i = 0; i < _mtile; i++) { - for (int j = 0; j < NRegs; j++) { - vmovups(vreg_t(CReg + i * NRegs + j), ptr[reg_matCptr + j * VecBytes]); - } - add(reg_matCptr, reg_cstride); - } - L(".end"); - outLocalLabel(); - } - - void write_back(int _mtile) { - inLocalLabel(); - mov(reg_matCptr, ptr[parambase + OFFSET(matC)]); - load32(reg_cstride, ptr[parambase + OFFSET(cstride)]); - lea(reg_matCptr, ptr[reg_matCptr + reg_itern * sizeof(CType)]); - for (int i = 0; i < _mtile; i++) { - for (int j = 0; j < NRegs; j++) { - vmovups(ptr[reg_matCptr + j * VecBytes], vreg_t(CReg + i * NRegs + j)); - } - add(reg_matCptr, reg_cstride); - } - outLocalLabel(); - } -}; - -template -class Avx512fp16N32P1 : protected jblas::xbyak::JitAvx512_fp16 { - public: - static int constexpr RegLen = 32, PackRow = 1; - static_assert(_NTILE % RegLen == 0); - static int constexpr NRegs = _NTILE / RegLen; - static int constexpr MRegs = _MTILE == 0 ? 
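// generate_fma() has two codegen paths, chosen by assign_regs() above:
//   BRegCount == NRegs: vmovups zmm_B, [B]; vfmadd231ps zmm_C, zmm_A, zmm_B
//                       (B row loaded once, reused for every A broadcast)
//   BRegCount == 0:     vfmadd231ps zmm_C, zmm_A, [B]
//                       (B folded into the FMA memory operand, freeing registers)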
(RegCount - 1) / NRegs : _MTILE; - static_assert(NRegs * MRegs <= RegCount - 1); - static int constexpr NTILE = RegLen * NRegs, MTILE = MRegs, KTILE = 1; - static int constexpr KUNROLL = 2; - static uint32_t constexpr ISA = (uint32_t)JBLAS_ISA::JblasAVX512_FP16; - static uint32_t constexpr COMPUTE = (uint32_t)CompType::COMP_FP16_FP16; - typedef utils::fp16 AType; - typedef utils::fp16 BType; - typedef utils::fp16 CType; - - struct params { - AType* matA; - int astride; - BType* matB; - int bstride; - CType* matC; - int cstride; - int k; - int n; - int init; - }; - typedef long long (*func_t)(params*); - - int CRegCount = 0, BRegCount = 0, ARegCount = 0, TmpRegCount = 0; - int CReg = 0, BReg = 0, AReg = 0, TmpReg = 0; - static int constexpr BKStepSize = KTILE * NTILE * sizeof(BType); - static int constexpr AKStepSize = KTILE * sizeof(AType); - - void generate_code(int _mtile) { - assign_regs(); - reset(); - generate_mtile(_mtile); - ready(); - mKernel = getCode(); - } - func_t mKernel = nullptr; - - protected: - Xbyak::Reg64 parambase; - Xbyak::Reg64 reg_matAptr; - Xbyak::Reg64 reg_matBptr; - Xbyak::Reg64 reg_matCptr; - Xbyak::Reg64 reg_ksize; - Xbyak::Reg64 reg_nsize; - Xbyak::Reg64 reg_cstride; - Xbyak::Reg64 reg_astride; - Xbyak::Reg64 reg_iterk; - Xbyak::Reg64 reg_itern; - Xbyak::Reg64 reg_tmp; - Xbyak::Reg64 reg_tmp1; - Xbyak::Reg64 reg_tmp2; - Xbyak::Reg64 reg_ret = rax; - Xbyak::Opmask msk_wr = k1; - - void assign_regs() { - CRegCount = MRegs * NRegs; - ARegCount = 1; - BRegCount = RegCount - ARegCount - CRegCount; - if (BRegCount < NRegs) { - BRegCount = 0; - ARegCount = BRegCount + 1; - } - if (BRegCount > NRegs) { - BRegCount = NRegs; - } - CReg = 0; - BReg = CReg + CRegCount; - AReg = BReg + BRegCount; - TmpReg = AReg + ARegCount; - assert(TmpReg <= RegCount); - TmpRegCount = RegCount - TmpReg; - } - - void generate_mtile(int _mtile) { - inLocalLabel(); // use local label for multiple instance - Xbyak::util::StackFrame st(this, 1, 10, 16 * 10); - parambase = st.p[0]; - reg_matAptr = st.t[0]; - reg_matBptr = st.t[1]; - reg_matCptr = st.t[0]; - reg_ksize = st.t[2]; - reg_astride = st.t[3]; - reg_cstride = st.t[3]; - reg_iterk = st.t[4]; - reg_tmp = st.t[5]; - reg_tmp1 = st.t[6]; - reg_tmp2 = st.t[7]; - reg_nsize = st.t[8]; - reg_itern = st.t[9]; - reg_ret = rax; - - vreg_push(rsp); - - load32(reg_ksize, ptr[parambase + OFFSET(k)]); - load32(reg_nsize, ptr[parambase + OFFSET(n)]); - xor_(reg_itern, reg_itern); - L(".nloop"); - init_regs(_mtile); - mov(reg_matAptr, ptr[parambase + OFFSET(matA)]); - load32(reg_astride, ptr[parambase + OFFSET(astride)]); - mov(reg_matBptr, ptr[parambase + OFFSET(matB)]); - load32(reg_tmp, ptr[parambase + OFFSET(bstride)]); - imul(reg_tmp, reg_itern); - lea(reg_matBptr, ptr[reg_matBptr + reg_tmp]); - xor_(reg_iterk, reg_iterk); - generate_kloop(_mtile); - write_back(_mtile); - add(reg_itern, NTILE); - cmp(reg_itern, reg_nsize); - jb(".nloop"); - mov(reg_ret, 0); - vreg_pop(rsp); - - outLocalLabel(); // end of local label - } - - void generate_kloop(int _mtile) { - inLocalLabel(); - mov(reg_tmp, reg_ksize); - padto_le(reg_tmp, KUNROLL * KTILE); - cmp(reg_tmp, 0); - jz(".kloop", T_NEAR); - L(".unkloop"); - generate_fma(_mtile, KUNROLL); - add(reg_matAptr, KUNROLL * AKStepSize); - add(reg_matBptr, KUNROLL * BKStepSize); - add(reg_iterk, KUNROLL * KTILE); - cmp(reg_iterk, reg_tmp); // k iteration variable - jb(".unkloop"); - cmp(reg_tmp, reg_ksize); - jge(".kend", T_NEAR); - L(".kloop"); - generate_fma(_mtile, 1); - add(reg_matAptr, 1 * AKStepSize); - 
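// Avx512fp16 notes: RegLen is 32 because a 512-bit zmm holds 32 fp16 lanes, and
// COMP_FP16_FP16 means A, B and the accumulator C are all fp16. Per lane, the
// vfmadd231ph below behaves roughly like:
//   c = fp16(float(a) * float(b) + float(c));  // accumulation stays 16-bit
// trading accumulator precision for twice the lane count of the fp32 kernel.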
add(reg_matBptr, 1 * BKStepSize); - add(reg_iterk, 1 * KTILE); - cmp(reg_iterk, reg_ksize); // k iteration variable - jb(".kloop"); - L(".kend"); - outLocalLabel(); - } - - void generate_fma(int _mtile, int _ktile) { - for (int kk = 0; kk < _ktile; kk++) { - lea(reg_tmp1, ptr[reg_matAptr + kk * AKStepSize]); - if (BRegCount == NRegs) { - for (int i = 0; i < NRegs; i++) { - vmovups(vreg_t(BReg + i), ptr[reg_matBptr + kk * BKStepSize + i * VecBytes]); - } - for (int mm = 0; mm < _mtile; mm++) { - vpbroadcastw(vreg_t(AReg), ptr[reg_tmp1]); - add(reg_tmp1, reg_astride); - for (int i = 0; i < NRegs; i++) { - vfmadd231ph(vreg_t(CReg + mm * NRegs + i), vreg_t(AReg), vreg_t(BReg + i)); - } - } - } else if (BRegCount == 0) { - for (int mm = 0; mm < _mtile; mm += ARegCount) { - int mm_re = utils::remainsize(mm, _mtile, ARegCount); - for (int imm = 0; imm < mm_re; imm++) { - vpbroadcastw(vreg_t(AReg + imm), ptr[reg_tmp1]); - add(reg_tmp1, reg_astride); - for (int i = 0; i < NRegs; i++) { - vfmadd231ph(vreg_t(CReg + mm * NRegs + i), vreg_t(AReg + imm), - ptr[reg_matBptr + kk * BKStepSize + i * VecBytes]); - } - } - } - } else { - assert(0); - } - } - } - - void init_regs(int _mtile) { - inLocalLabel(); - load32(reg_tmp, ptr[parambase + OFFSET(init)]); - cmp(reg_tmp, 0); - je(".read", T_NEAR); - for (int i = 0; i < _mtile; i++) { - for (int j = 0; j < NRegs; j++) { - vxor(vreg_t(CReg + i * NRegs + j), vreg_t(CReg + i * NRegs + j), vreg_t(CReg + i * NRegs + j)); - } - } - jmp(".end", T_NEAR); - L(".read"); - mov(reg_matCptr, ptr[parambase + OFFSET(matC)]); - lea(reg_matCptr, ptr[reg_matCptr + reg_itern * sizeof(CType)]); - load32(reg_cstride, ptr[parambase + OFFSET(cstride)]); - for (int i = 0; i < _mtile; i++) { - for (int j = 0; j < NRegs; j++) { - vmovups(vreg_t(CReg + i * NRegs + j), ptr[reg_matCptr + j * VecBytes]); - } - add(reg_matCptr, reg_cstride); - } - L(".end"); - outLocalLabel(); - } - - void write_back(int _mtile) { - inLocalLabel(); - mov(reg_matCptr, ptr[parambase + OFFSET(matC)]); - load32(reg_cstride, ptr[parambase + OFFSET(cstride)]); - lea(reg_matCptr, ptr[reg_matCptr + reg_itern * sizeof(CType)]); - for (int i = 0; i < _mtile; i++) { - for (int j = 0; j < NRegs; j++) { - vmovups(ptr[reg_matCptr + j * VecBytes], vreg_t(CReg + i * NRegs + j)); - } - add(reg_matCptr, reg_cstride); - } - outLocalLabel(); - } -}; - -template -class Avx512bf16N16P2 : protected jblas::xbyak::JitAvx512_bf16 { - public: - static int constexpr RegLen = 16, PackRow = 2; - static_assert(_NTILE % RegLen == 0); - static int constexpr NRegs = _NTILE / RegLen; - static int constexpr MRegs = _MTILE == 0 ? 
(RegCount - 1) / NRegs : _MTILE; - static_assert(NRegs * MRegs <= RegCount - 1); - static int constexpr NTILE = RegLen * NRegs, MTILE = MRegs, KTILE = 2; - static int constexpr KUNROLL = 2; - static uint32_t constexpr ISA = (uint32_t)JBLAS_ISA::JblasAVX512_BF16; - static uint32_t constexpr COMPUTE = (uint32_t)CompType::COMP_BF16_FP32; - typedef utils::bf16 AType; - typedef utils::bf16 BType; - typedef float CType; - - struct params { - AType* matA; - int astride; - BType* matB; - int bstride; - CType* matC; - int cstride; - int k; - int n; - int init; - }; - typedef long long (*func_t)(params*); - - int CRegCount = 0, BRegCount = 0, ARegCount = 0, TmpRegCount = 0; - int CReg = 0, BReg = 0, AReg = 0, TmpReg = 0; - static int constexpr BKStepSize = KTILE * NTILE * sizeof(BType); - static int constexpr AKStepSize = KTILE * sizeof(AType); - - void generate_code(int _mtile) { - assign_regs(); - reset(); - generate_mtile(_mtile); - ready(); - mKernel = getCode(); - } - func_t mKernel = nullptr; - - protected: - Xbyak::Reg64 parambase; - Xbyak::Reg64 reg_matAptr; - Xbyak::Reg64 reg_matBptr; - Xbyak::Reg64 reg_matCptr; - Xbyak::Reg64 reg_ksize; - Xbyak::Reg64 reg_nsize; - Xbyak::Reg64 reg_cstride; - Xbyak::Reg64 reg_astride; - Xbyak::Reg64 reg_iterk; - Xbyak::Reg64 reg_itern; - Xbyak::Reg64 reg_tmp; - Xbyak::Reg64 reg_tmp1; - Xbyak::Reg64 reg_tmp2; - Xbyak::Reg64 reg_ret = rax; - Xbyak::Opmask msk_wr = k1; - - void assign_regs() { - CRegCount = MRegs * NRegs; - ARegCount = 1; - BRegCount = RegCount - ARegCount - CRegCount; - if (BRegCount < NRegs) { - BRegCount = 0; - ARegCount = BRegCount + 1; - } - if (BRegCount > NRegs) { - BRegCount = NRegs; - } - CReg = 0; - BReg = CReg + CRegCount; - AReg = BReg + BRegCount; - TmpReg = AReg + ARegCount; - assert(TmpReg <= RegCount); - TmpRegCount = RegCount - TmpReg; - } - - void generate_mtile(int _mtile) { - inLocalLabel(); // use local label for multiple instance - Xbyak::util::StackFrame st(this, 1, 10, 16 * 10); - parambase = st.p[0]; - reg_matAptr = st.t[0]; - reg_matBptr = st.t[1]; - reg_matCptr = st.t[0]; - reg_ksize = st.t[2]; - reg_astride = st.t[3]; - reg_cstride = st.t[3]; - reg_iterk = st.t[4]; - reg_tmp = st.t[5]; - reg_tmp1 = st.t[6]; - reg_tmp2 = st.t[7]; - reg_nsize = st.t[8]; - reg_itern = st.t[9]; - reg_ret = rax; - - vreg_push(rsp); - - load32(reg_ksize, ptr[parambase + OFFSET(k)]); - load32(reg_nsize, ptr[parambase + OFFSET(n)]); - xor_(reg_itern, reg_itern); - L(".nloop"); - init_regs(_mtile); - mov(reg_matAptr, ptr[parambase + OFFSET(matA)]); - load32(reg_astride, ptr[parambase + OFFSET(astride)]); - mov(reg_matBptr, ptr[parambase + OFFSET(matB)]); - load32(reg_tmp, ptr[parambase + OFFSET(bstride)]); - imul(reg_tmp, reg_itern); - lea(reg_matBptr, ptr[reg_matBptr + reg_tmp]); - xor_(reg_iterk, reg_iterk); - generate_kloop(_mtile); - write_back(_mtile); - add(reg_itern, NTILE); - cmp(reg_itern, reg_nsize); - jb(".nloop"); - mov(reg_ret, 0); - vreg_pop(rsp); - - outLocalLabel(); // end of local label - } - - void generate_kloop(int _mtile) { - inLocalLabel(); - mov(reg_tmp, reg_ksize); - padto_le(reg_tmp, KUNROLL * KTILE); - cmp(reg_tmp, 0); - jz(".kloop", T_NEAR); - L(".unkloop"); - generate_fma(_mtile, KUNROLL); - add(reg_matAptr, KUNROLL * AKStepSize); - add(reg_matBptr, KUNROLL * BKStepSize); - add(reg_iterk, KUNROLL * KTILE); - cmp(reg_iterk, reg_tmp); // k iteration variable - jb(".unkloop"); - cmp(reg_tmp, reg_ksize); - jge(".kend", T_NEAR); - L(".kloop"); - generate_fma(_mtile, 1); - add(reg_matAptr, 1 * AKStepSize); - 
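// PackRow == 2 / KTILE == 2: A and B are pre-packed so two consecutive k-values
// share one 32-bit lane; the vbroadcastss below therefore splats a bf16 *pair*.
// Per fp32 lane, the vdpbf16ps it feeds is, as an illustrative scalar model:
//   float dpbf16(float acc, utils::bf16 a0, utils::bf16 a1,
//                utils::bf16 b0, utils::bf16 b1) {
//     return acc + float(a0) * float(b0) + float(a1) * float(b1);
//   }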
add(reg_matBptr, 1 * BKStepSize); - add(reg_iterk, 1 * KTILE); - cmp(reg_iterk, reg_ksize); // k iteration variable - jb(".kloop"); - L(".kend"); - outLocalLabel(); - } - - void generate_fma(int _mtile, int _ktile) { - for (int kk = 0; kk < _ktile; kk++) { - lea(reg_tmp1, ptr[reg_matAptr + kk * AKStepSize]); - if (BRegCount == NRegs) { - for (int i = 0; i < NRegs; i++) { - vmovups(vreg_t(BReg + i), ptr[reg_matBptr + kk * BKStepSize + i * VecBytes]); - } - for (int mm = 0; mm < _mtile; mm++) { - vbroadcastss(vreg_t(AReg), ptr[reg_tmp1]); - add(reg_tmp1, reg_astride); - for (int i = 0; i < NRegs; i++) { - vdpbf16ps(vreg_t(CReg + mm * NRegs + i), vreg_t(AReg), vreg_t(BReg + i)); - } - } - } else if (BRegCount == 0) { - for (int mm = 0; mm < _mtile; mm += ARegCount) { - int mm_re = utils::remainsize(mm, _mtile, ARegCount); - for (int imm = 0; imm < mm_re; imm++) { - vbroadcastss(vreg_t(AReg + imm), ptr[reg_tmp1]); - add(reg_tmp1, reg_astride); - for (int i = 0; i < NRegs; i++) { - vdpbf16ps(vreg_t(CReg + mm * NRegs + i), vreg_t(AReg + imm), - ptr[reg_matBptr + kk * BKStepSize + i * VecBytes]); - } - } - } - } else { - assert(0); - } - } - } - - void init_regs(int _mtile) { - inLocalLabel(); - load32(reg_tmp, ptr[parambase + OFFSET(init)]); - cmp(reg_tmp, 0); - je(".read", T_NEAR); - for (int i = 0; i < _mtile; i++) { - for (int j = 0; j < NRegs; j++) { - vxor(vreg_t(CReg + i * NRegs + j), vreg_t(CReg + i * NRegs + j), vreg_t(CReg + i * NRegs + j)); - } - } - jmp(".end", T_NEAR); - L(".read"); - mov(reg_matCptr, ptr[parambase + OFFSET(matC)]); - lea(reg_matCptr, ptr[reg_matCptr + reg_itern * sizeof(CType)]); - load32(reg_cstride, ptr[parambase + OFFSET(cstride)]); - for (int i = 0; i < _mtile; i++) { - for (int j = 0; j < NRegs; j++) { - vmovups(vreg_t(CReg + i * NRegs + j), ptr[reg_matCptr + j * VecBytes]); - } - add(reg_matCptr, reg_cstride); - } - L(".end"); - outLocalLabel(); - } - - void write_back(int _mtile) { - inLocalLabel(); - mov(reg_matCptr, ptr[parambase + OFFSET(matC)]); - load32(reg_cstride, ptr[parambase + OFFSET(cstride)]); - lea(reg_matCptr, ptr[reg_matCptr + reg_itern * sizeof(CType)]); - for (int i = 0; i < _mtile; i++) { - for (int j = 0; j < NRegs; j++) { - vmovups(ptr[reg_matCptr + j * VecBytes], vreg_t(CReg + i * NRegs + j)); - } - add(reg_matCptr, reg_cstride); - } - outLocalLabel(); - } -}; - -template -class Avx512vnniN16P4 : protected jblas::xbyak::JitAvx512vnni { - public: - static int constexpr RegLen = 16, PackRow = 4; - static_assert(_NTILE % RegLen == 0); - static int constexpr NRegs = _NTILE / RegLen; - static int constexpr MRegs = _MTILE == 0 ? 
(RegCount - 1) / NRegs : _MTILE; - static_assert(NRegs * MRegs <= RegCount - 1); - static int constexpr NTILE = RegLen * NRegs, MTILE = MRegs, KTILE = 4; - static int constexpr KUNROLL = 2; - static uint32_t constexpr ISA = (uint32_t)JBLAS_ISA::JblasAVX512_VNNI; - static uint32_t constexpr COMPUTE = (uint32_t)CompType::COMP_INT8_US_INT32; - typedef uint8_t AType; - typedef int8_t BType; - typedef int32_t CType; - struct params { - AType* matA; - int astride; - BType* matB; - int bstride; - CType* matC; - int cstride; - int k; - int n; - int init; - }; - typedef long long (*func_t)(params*); - - int CRegCount = 0, BRegCount = 0, ARegCount = 0, TmpRegCount = 0; - int CReg = 0, BReg = 0, AReg = 0, TmpReg = 0; - static int constexpr BKStepSize = KTILE * NTILE * sizeof(BType); - static int constexpr AKStepSize = KTILE * sizeof(AType); - - void generate_code(int _mtile) { - assign_regs(); - reset(); - generate_mtile(_mtile); - ready(); - mKernel = getCode(); - } - func_t mKernel = nullptr; - - private: - Xbyak::Reg64 parambase; - Xbyak::Reg64 reg_matAptr; - Xbyak::Reg64 reg_matBptr; - Xbyak::Reg64 reg_matCptr; - Xbyak::Reg64 reg_ksize; - Xbyak::Reg64 reg_nsize; - Xbyak::Reg64 reg_cstride; - Xbyak::Reg64 reg_astride; - Xbyak::Reg64 reg_iterk; - Xbyak::Reg64 reg_itern; - Xbyak::Reg64 reg_tmp; - Xbyak::Reg64 reg_tmp1; - Xbyak::Reg64 reg_tmp2; - Xbyak::Reg64 reg_ret = rax; - - protected: - void assign_regs() { - CRegCount = MRegs * NRegs; - ARegCount = 1; - BRegCount = RegCount - ARegCount - CRegCount; - if (BRegCount < NRegs) { - BRegCount = 0; - ARegCount = BRegCount + 1; - } - if (BRegCount > NRegs) { - BRegCount = NRegs; - } - CReg = 0; - BReg = CReg + CRegCount; - AReg = BReg + BRegCount; - TmpReg = AReg + ARegCount; - assert(TmpReg <= RegCount); - TmpRegCount = RegCount - TmpReg; - } - - void generate_mtile(int _mtile) { - inLocalLabel(); - Xbyak::util::StackFrame st(this, 1, 10, 16 * 10); - parambase = st.p[0]; - reg_matAptr = st.t[0]; - reg_matBptr = st.t[1]; - reg_matCptr = st.t[0]; - reg_ksize = st.t[2]; - reg_astride = st.t[3]; - reg_cstride = st.t[3]; - reg_iterk = st.t[4]; - reg_tmp = st.t[5]; - reg_tmp1 = st.t[6]; - reg_tmp2 = st.t[7]; - reg_nsize = st.t[8]; - reg_itern = st.t[9]; - reg_ret = rax; - - vreg_push(rsp); - - load32(reg_ksize, ptr[parambase + OFFSET(k)]); - load32(reg_nsize, ptr[parambase + OFFSET(n)]); - xor_(reg_itern, reg_itern); - L(".nloop"); - init_regs(_mtile); - mov(reg_matAptr, ptr[parambase + OFFSET(matA)]); - load32(reg_astride, ptr[parambase + OFFSET(astride)]); - mov(reg_matBptr, ptr[parambase + OFFSET(matB)]); - load32(reg_tmp, ptr[parambase + OFFSET(bstride)]); - imul(reg_tmp, reg_itern); - lea(reg_matBptr, ptr[reg_matBptr + reg_tmp]); - xor_(reg_iterk, reg_iterk); - generate_kloop(_mtile); - write_back(_mtile); - add(reg_itern, NTILE); - cmp(reg_itern, reg_nsize); - jb(".nloop"); - mov(reg_ret, 0); - vreg_pop(rsp); - - outLocalLabel(); // end of local label - } - - void generate_kloop(int _mtile) { - inLocalLabel(); - mov(reg_tmp, reg_ksize); - padto_le(reg_tmp, KUNROLL * KTILE); - cmp(reg_tmp, 0); - jz(".kloop", T_NEAR); - L(".unkloop"); - generate_fma(_mtile, KUNROLL); - add(reg_matAptr, KUNROLL * AKStepSize); - add(reg_matBptr, KUNROLL * BKStepSize); - add(reg_iterk, KUNROLL * KTILE); - cmp(reg_iterk, reg_tmp); // k iteration variable - jb(".unkloop"); - cmp(reg_tmp, reg_ksize); - jge(".kend", T_NEAR); - L(".kloop"); - generate_fma(_mtile, 1); - add(reg_matAptr, 1 * AKStepSize); - add(reg_matBptr, 1 * BKStepSize); - add(reg_iterk, 1 * KTILE); - 
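// PackRow == 4 / KTILE == 4: the packed layouts put four consecutive k-values
// (four bytes) into each 32-bit lane, so AKStepSize is 4 bytes and the
// vpbroadcastd below splats one dword, i.e. one A row's next four k-values,
// feeding a full vpdpbusds step against NTILE packed B columns.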
cmp(reg_iterk, reg_ksize); // k iteration variable - jb(".kloop"); - L(".kend"); - outLocalLabel(); - } - - void generate_fma(int _mtile, int _kunroll) { - for (int kk = 0; kk < _kunroll; kk++) { - lea(reg_tmp1, ptr[reg_matAptr + kk * AKStepSize]); - if (BRegCount == NRegs) { - for (int i = 0; i < NRegs; i++) { - vmovups(vreg_t(BReg + i), ptr[reg_matBptr + kk * BKStepSize + i * VecBytes]); - } - for (int mm = 0; mm < _mtile; mm++) { - vpbroadcastd(vreg_t(AReg), ptr[reg_tmp1]); - add(reg_tmp1, reg_astride); - for (int i = 0; i < NRegs; i++) { - vpdpbusds_(vreg_t(CReg + mm * NRegs + i), vreg_t(AReg), vreg_t(BReg + i)); - } - } - } else if (BRegCount == 0) { - for (int mm = 0; mm < _mtile; mm += ARegCount) { - int mm_re = utils::remainsize(mm, _mtile, ARegCount); - for (int imm = 0; imm < mm_re; imm++) { - vpbroadcastd(vreg_t(AReg + imm), ptr[reg_tmp1]); - add(reg_tmp1, reg_astride); - for (int i = 0; i < NRegs; i++) { - vpdpbusds_(vreg_t(CReg + mm * NRegs + i), vreg_t(AReg + imm), - ptr[reg_matBptr + kk * BKStepSize + i * VecBytes]); - } - } - } - } else { - assert(0); - } - } - } - - void init_regs(int _mtile) { - inLocalLabel(); - load32(reg_tmp, ptr[parambase + OFFSET(init)]); - cmp(reg_tmp, 0); - je(".read", T_NEAR); - for (int i = 0; i < _mtile; i++) { - for (int j = 0; j < NRegs; j++) { - vxor(vreg_t(CReg + i * NRegs + j), vreg_t(CReg + i * NRegs + j), vreg_t(CReg + i * NRegs + j)); - } - } - jmp(".end", T_NEAR); - L(".read"); - mov(reg_matCptr, ptr[parambase + OFFSET(matC)]); - lea(reg_matCptr, ptr[reg_matCptr + reg_itern * sizeof(CType)]); - load32(reg_cstride, ptr[parambase + OFFSET(cstride)]); - for (int i = 0; i < _mtile; i++) { - for (int j = 0; j < NRegs; j++) { - vmovups(vreg_t(CReg + i * NRegs + j), ptr[reg_matCptr + j * VecBytes]); - } - add(reg_matCptr, reg_cstride); - } - L(".end"); - outLocalLabel(); - } - - void write_back(int _mtile) { - inLocalLabel(); - mov(reg_matCptr, ptr[parambase + OFFSET(matC)]); - load32(reg_cstride, ptr[parambase + OFFSET(cstride)]); - lea(reg_matCptr, ptr[reg_matCptr + reg_itern * sizeof(CType)]); - for (int i = 0; i < _mtile; i++) { - for (int j = 0; j < NRegs; j++) { - vmovups(ptr[reg_matCptr + j * VecBytes], vreg_t(CReg + i * NRegs + j)); - } - add(reg_matCptr, reg_cstride); - } - outLocalLabel(); - } -}; - -template -class AvxvnniN8P4 : protected jblas::xbyak::JitAvxvnni { - public: - static int constexpr RegLen = 8, PackRow = 4; - static_assert(_NTILE % RegLen == 0); - static int constexpr NRegs = _NTILE / RegLen; - static int constexpr MRegs = _MTILE == 0 ? 
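// Scalar model of vpdpbusds, per int32 lane: a 4-wide dot product of unsigned
// A bytes with signed B bytes, accumulated with signed saturation (sketch only):
//   int32_t dpbusds(int32_t acc, const uint8_t a[4], const int8_t b[4]) {
//     int64_t s = acc;
//     for (int t = 0; t < 4; ++t) s += int64_t(a[t]) * b[t];
//     return s > INT32_MAX ? INT32_MAX : s < INT32_MIN ? INT32_MIN : int32_t(s);
//   }
// This unsigned-A/signed-B pairing is what COMP_INT8_US_INT32 encodes.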
(RegCount - 1) / NRegs : _MTILE; - static_assert(NRegs * MRegs <= RegCount - 1); - static int constexpr NTILE = RegLen * NRegs, MTILE = MRegs, KTILE = 4; - static int constexpr KUNROLL = 2; - static uint32_t constexpr ISA = (uint32_t)JBLAS_ISA::JblasAVX_VNNI; - static uint32_t constexpr COMPUTE = (uint32_t)CompType::COMP_INT8_US_INT32; - typedef uint8_t AType; - typedef int8_t BType; - typedef int32_t CType; - struct params { - AType* matA; - int astride; - BType* matB; - int bstride; - CType* matC; - int cstride; - int k; - int n; - int init; - }; - typedef long long (*func_t)(params*); - - int CRegCount = 0, BRegCount = 0, ARegCount = 0, TmpRegCount = 0; - int CReg = 0, BReg = 0, AReg = 0, TmpReg = 0; - static int constexpr BKStepSize = KTILE * NTILE * sizeof(BType); - static int constexpr AKStepSize = KTILE * sizeof(AType); - - void generate_code(int _mtile) { - assign_regs(); - reset(); - generate_mtile(_mtile); - ready(); - mKernel = getCode(); - } - func_t mKernel = nullptr; - - private: - Xbyak::Reg64 parambase; - Xbyak::Reg64 reg_matAptr; - Xbyak::Reg64 reg_matBptr; - Xbyak::Reg64 reg_matCptr; - Xbyak::Reg64 reg_ksize; - Xbyak::Reg64 reg_nsize; - Xbyak::Reg64 reg_cstride; - Xbyak::Reg64 reg_astride; - Xbyak::Reg64 reg_iterk; - Xbyak::Reg64 reg_itern; - Xbyak::Reg64 reg_tmp; - Xbyak::Reg64 reg_tmp1; - Xbyak::Reg64 reg_tmp2; - Xbyak::Reg64 reg_ret = rax; - Xbyak::Opmask msk_wr = k1; - - protected: - void assign_regs() { - CRegCount = MRegs * NRegs; - ARegCount = 1; - BRegCount = RegCount - ARegCount - CRegCount; - if (BRegCount < NRegs) { - BRegCount = 0; - ARegCount = BRegCount + 1; - } - if (BRegCount > NRegs) { - BRegCount = NRegs; - } - CReg = 0; - BReg = CReg + CRegCount; - AReg = BReg + BRegCount; - TmpReg = AReg + ARegCount; - assert(TmpReg <= RegCount); - TmpRegCount = RegCount - TmpReg; - } - - void generate_mtile(int _mtile) { - inLocalLabel(); - Xbyak::util::StackFrame st(this, 1, 10, 16 * 10); - parambase = st.p[0]; - reg_matAptr = st.t[0]; - reg_matBptr = st.t[1]; - reg_matCptr = st.t[0]; - reg_ksize = st.t[2]; - reg_astride = st.t[3]; - reg_cstride = st.t[3]; - reg_iterk = st.t[4]; - reg_tmp = st.t[5]; - reg_tmp1 = st.t[6]; - reg_tmp2 = st.t[7]; - reg_nsize = st.t[8]; - reg_itern = st.t[9]; - reg_ret = rax; - - vreg_push(rsp); - - load32(reg_ksize, ptr[parambase + OFFSET(k)]); - load32(reg_nsize, ptr[parambase + OFFSET(n)]); - xor_(reg_itern, reg_itern); - L(".nloop"); - init_regs(_mtile); - mov(reg_matAptr, ptr[parambase + OFFSET(matA)]); - load32(reg_astride, ptr[parambase + OFFSET(astride)]); - mov(reg_matBptr, ptr[parambase + OFFSET(matB)]); - load32(reg_tmp, ptr[parambase + OFFSET(bstride)]); - imul(reg_tmp, reg_itern); - lea(reg_matBptr, ptr[reg_matBptr + reg_tmp]); - xor_(reg_iterk, reg_iterk); - generate_kloop(_mtile); - write_back(_mtile); - add(reg_itern, NTILE); - cmp(reg_itern, reg_nsize); - jb(".nloop"); - mov(reg_ret, 0); - vreg_pop(rsp); - - outLocalLabel(); // end of local label - } - - void generate_kloop(int _mtile) { - inLocalLabel(); - mov(reg_tmp, reg_ksize); - padto_le(reg_tmp, KUNROLL * KTILE); - cmp(reg_tmp, 0); - jz(".kloop", T_NEAR); - L(".unkloop"); - generate_fma(_mtile, KUNROLL); - add(reg_matAptr, KUNROLL * AKStepSize); - add(reg_matBptr, KUNROLL * BKStepSize); - add(reg_iterk, KUNROLL * KTILE); - cmp(reg_iterk, reg_tmp); // k iteration variable - jb(".unkloop"); - cmp(reg_tmp, reg_ksize); - jge(".kend", T_NEAR); - L(".kloop"); - generate_fma(_mtile, 1); - add(reg_matAptr, 1 * AKStepSize); - add(reg_matBptr, 1 * BKStepSize); - 
add(reg_iterk, 1 * KTILE); - cmp(reg_iterk, reg_ksize); // k iteration variable - jb(".kloop"); - L(".kend"); - outLocalLabel(); - } - - void generate_fma(int _mtile, int _kunroll) { - for (int kk = 0; kk < _kunroll; kk++) { - lea(reg_tmp1, ptr[reg_matAptr + kk * AKStepSize]); - if (BRegCount == NRegs) { - for (int i = 0; i < NRegs; i++) { - vmovups(vreg_t(BReg + i), ptr[reg_matBptr + kk * BKStepSize + i * VecBytes]); - } - for (int mm = 0; mm < _mtile; mm++) { - vpbroadcastd(vreg_t(AReg), ptr[reg_tmp1]); - add(reg_tmp1, reg_astride); - for (int i = 0; i < NRegs; i++) { - vpdpbusds_(vreg_t(CReg + mm * NRegs + i), vreg_t(AReg), vreg_t(BReg + i)); - } - } - } else if (BRegCount == 0) { - for (int mm = 0; mm < _mtile; mm += ARegCount) { - int mm_re = utils::remainsize(mm, _mtile, ARegCount); - for (int imm = 0; imm < mm_re; imm++) { - vpbroadcastd(vreg_t(AReg + imm), ptr[reg_tmp1]); - add(reg_tmp1, reg_astride); - for (int i = 0; i < NRegs; i++) { - vpdpbusds_(vreg_t(CReg + mm * NRegs + i), vreg_t(AReg + imm), - ptr[reg_matBptr + kk * BKStepSize + i * VecBytes]); - } - } - } - } else { - assert(0); - } - } - } - - void init_regs(int _mtile) { - inLocalLabel(); - load32(reg_tmp, ptr[parambase + OFFSET(init)]); - cmp(reg_tmp, 0); - je(".read", T_NEAR); - for (int i = 0; i < _mtile; i++) { - for (int j = 0; j < NRegs; j++) { - vxor(vreg_t(CReg + i * NRegs + j), vreg_t(CReg + i * NRegs + j), vreg_t(CReg + i * NRegs + j)); - } - } - jmp(".end", T_NEAR); - L(".read"); - mov(reg_matCptr, ptr[parambase + OFFSET(matC)]); - lea(reg_matCptr, ptr[reg_matCptr + reg_itern * sizeof(CType)]); - load32(reg_cstride, ptr[parambase + OFFSET(cstride)]); - for (int i = 0; i < _mtile; i++) { - for (int j = 0; j < NRegs; j++) { - vmovups(vreg_t(CReg + i * NRegs + j), ptr[reg_matCptr + j * VecBytes]); - } - add(reg_matCptr, reg_cstride); - } - L(".end"); - outLocalLabel(); - } - - void write_back(int _mtile) { - inLocalLabel(); - mov(reg_matCptr, ptr[parambase + OFFSET(matC)]); - load32(reg_cstride, ptr[parambase + OFFSET(cstride)]); - lea(reg_matCptr, ptr[reg_matCptr + reg_itern * sizeof(CType)]); - for (int i = 0; i < _mtile; i++) { - for (int j = 0; j < NRegs; j++) { - vmovups(ptr[reg_matCptr + j * VecBytes], vreg_t(CReg + i * NRegs + j)); - } - add(reg_matCptr, reg_cstride); - } - outLocalLabel(); - } -}; - -template -class Amxbf16N16P2 : protected jblas::xbyak::JitAmxbf16 { - public: - static int constexpr RegLen = 16, PackRow = 2; - static_assert(_NTILE % RegLen == 0); - static_assert(_MTILE % RegLen == 0); - static int constexpr NRegs = _NTILE / RegLen; - static int constexpr MRegs = _MTILE == 0 ? 
1 : _MTILE / RegLen; - static_assert(NRegs * MRegs + 2 <= TileCount); - static int constexpr NTILE = RegLen * NRegs, MTILE = MRegs * RegLen, KTILE = 32; - static int constexpr KUNROLL = 2; - static uint32_t constexpr ISA = (uint32_t)JBLAS_ISA::JblasAMX_BF16; - static uint32_t constexpr COMPUTE = (uint32_t)CompType::COMP_BF16_FP32; - typedef utils::bf16 AType; - typedef utils::bf16 BType; - typedef float CType; - - struct params { - AType* matA; - int astride; - BType* matB; - int bstride; - CType* matC; - int cstride; - int k; - int n; - int init; - void* workspace; - }; - typedef long long (*func_t)(params*); - - int TmpRegCount = RegCount; - int TmpReg = 0; - int CTileCount = 0, ATileCount = 0, BTileCount = 0; - int CTile = 0, ATile = 0, BTile = 0; - static int constexpr BKStepSize = KTILE * NTILE * sizeof(BType); - static int constexpr AKStepSize = KTILE * sizeof(AType); - - void generate_code(int _mtile) { - assign_regs(); - reset(); - generate_mtile(_mtile); - ready(); - mKernel = getCode(); - } - func_t mKernel = nullptr; - - protected: - Xbyak::Reg64 parambase; - Xbyak::Reg64 reg_matAptr; - Xbyak::Reg64 reg_matBptr; - Xbyak::Reg64 reg_matCptr; - Xbyak::Reg64 reg_ksize; - Xbyak::Reg64 reg_nsize; - Xbyak::Reg64 reg_cstride; - Xbyak::Reg64 reg_astride; - Xbyak::Reg64 reg_iterk; - Xbyak::Reg64 reg_itern; - Xbyak::Reg64 reg_tmp; - Xbyak::Reg64 reg_tmp1; - Xbyak::Reg64 reg_tmp2; - Xbyak::Reg64 reg_tmp3; - Xbyak::Reg64 reg_ret = rax; - - void assign_regs() { - CTileCount = NRegs * MRegs; - auto tile_re = TileCount - CTileCount; - if (tile_re - 1 >= NRegs) { - BTileCount = NRegs; - ATileCount = tile_re - BTileCount; - } else if (tile_re - 1 >= MRegs) { - ATileCount = MRegs; - BTileCount = tile_re - ATileCount; - } else { - ATileCount = 1; - BTileCount = tile_re - ATileCount; - } - CTile = 0; - ATile = CTile + CTileCount; - BTile = ATile + ATileCount; - } - - void generate_mtile(int _mtile) { - inLocalLabel(); // use local label for multiple instance - Xbyak::util::StackFrame st(this, 1, 11, 16 * 10); - parambase = st.p[0]; - reg_matAptr = st.t[0]; - reg_matBptr = st.t[1]; - reg_matCptr = st.t[0]; - reg_ksize = st.t[2]; - reg_astride = st.t[3]; - reg_cstride = st.t[3]; - reg_iterk = st.t[4]; - reg_tmp = st.t[5]; - reg_tmp1 = st.t[6]; - reg_tmp2 = st.t[7]; - reg_tmp3 = st.t[10]; - reg_nsize = st.t[8]; - reg_itern = st.t[9]; - reg_ret = rax; - - vreg_push(rsp); - - load32(reg_ksize, ptr[parambase + OFFSET(k)]); - load32(reg_nsize, ptr[parambase + OFFSET(n)]); - xor_(reg_itern, reg_itern); - L(".nloop"); - init_regs(_mtile); - mov(reg_matAptr, ptr[parambase + OFFSET(matA)]); - load32(reg_astride, ptr[parambase + OFFSET(astride)]); - mov(reg_matBptr, ptr[parambase + OFFSET(matB)]); - load32(reg_tmp, ptr[parambase + OFFSET(bstride)]); - imul(reg_tmp, reg_itern); - lea(reg_matBptr, ptr[reg_matBptr + reg_tmp]); - xor_(reg_iterk, reg_iterk); - generate_kloop(_mtile); - write_back(_mtile); - add(reg_itern, NTILE); - cmp(reg_itern, reg_nsize); - jb(".nloop"); - mov(reg_ret, 0); - vreg_pop(rsp); - - outLocalLabel(); // end of local label - } - - void generate_kloop(int _mtile) { - inLocalLabel(); - mov(reg_tmp, reg_ksize); - padto_le(reg_tmp, KUNROLL * KTILE); - cmp(reg_tmp, 0); - jz(".kloop", T_NEAR); - L(".unkloop"); - generate_fma(_mtile, KUNROLL); - add(reg_matAptr, KUNROLL * AKStepSize); - add(reg_matBptr, KUNROLL * BKStepSize); - add(reg_iterk, KUNROLL * KTILE); - cmp(reg_iterk, reg_tmp); // k iteration variable - jb(".unkloop"); - cmp(reg_tmp, reg_ksize); - jge(".kend", T_NEAR); - L(".kloop"); - 
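// Worked tile budget for this AMX generator (assuming TileCount == 8 tmm regs):
//   _NTILE = 32, _MTILE = 32 -> NRegs = 2, MRegs = 2, CTileCount = 4;
// the 4 leftover tiles split into BTileCount = NRegs = 2 and ATileCount = 2,
// so the A tiles, B tiles and C accumulator tiles all stay resident at once.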
generate_fma(_mtile, 1); - add(reg_matAptr, 1 * AKStepSize); - add(reg_matBptr, 1 * BKStepSize); - add(reg_iterk, 1 * KTILE); - cmp(reg_iterk, reg_ksize); // k iteration variable - jb(".kloop"); - L(".kend"); - outLocalLabel(); - } - - void generate_fma(int _mtile, int kunrll) { - auto& reg_Bstride = reg_tmp1; - mov(reg_Bstride, NTILE * 4); - int mtiles = _mtile / RegLen; - - for (int kk = 0; kk < kunrll; kk++) { - auto& reg_Atmp = reg_tmp2; - if (mtiles == 1) { - reg_Atmp = reg_matAptr; - } else { - mov(reg_Atmp, reg_matAptr); - } - if (BTileCount == NRegs) { - for (int i = 0; i < NRegs; i++) { - tileloaddt1(Xbyak::Tmm(BTile + i), ptr[reg_matBptr + reg_Bstride + kk * BKStepSize + i * 64]); - } - for (int mm = 0; mm < mtiles; mm++) { - tileloadd(Xbyak::Tmm(ATile), ptr[reg_Atmp + reg_astride + kk * AKStepSize]); - for (int i = 0; i < NRegs; i++) { - tdpbf16ps(Xbyak::Tmm(CTile + mm * NRegs + i), Xbyak::Tmm(ATile), Xbyak::Tmm(BTile + i)); - } - if (mm != mtiles - 1) { - lea(reg_Atmp, ptr[reg_Atmp + 8 * reg_astride]); - lea(reg_Atmp, ptr[reg_Atmp + 8 * reg_astride]); - } - } - } else { - if (ATileCount == mtiles) { - for (int mm = 0; mm < mtiles; mm++) { - tileloadd(Xbyak::Tmm(ATile + mm), ptr[reg_Atmp + reg_astride + kk * AKStepSize]); - if (mm != mtiles - 1) { - lea(reg_Atmp, ptr[reg_Atmp + 8 * reg_astride]); - lea(reg_Atmp, ptr[reg_Atmp + 8 * reg_astride]); - } - } - for (int i = 0; i < NRegs; i++) { - tileloaddt1(Xbyak::Tmm(BTile), ptr[reg_matBptr + reg_Bstride + kk * BKStepSize + i * 64]); - for (int mm = 0; mm < mtiles; mm++) { - tdpbf16ps(Xbyak::Tmm(CTile + mm * NRegs + i), Xbyak::Tmm(ATile + mm), Xbyak::Tmm(BTile)); - } - } - } else { - for (int mm = 0; mm < mtiles; mm++) { - tileloadd(Xbyak::Tmm(ATile), ptr[reg_Atmp + reg_astride + kk * AKStepSize]); - for (int i = 0; i < NRegs; i++) { - tileloaddt1(Xbyak::Tmm(BTile), ptr[reg_matBptr + reg_Bstride + kk * BKStepSize + i * 64]); - tdpbf16ps(Xbyak::Tmm(CTile + mm * NRegs + i), Xbyak::Tmm(ATile), Xbyak::Tmm(BTile)); - } - if (mm != mtiles - 1) { - lea(reg_Atmp, ptr[reg_Atmp + 8 * reg_astride]); - lea(reg_Atmp, ptr[reg_Atmp + 8 * reg_astride]); - } - } - } - } - } - } - - void init_regs(int _mtile) { - inLocalLabel(); - load32(reg_tmp, ptr[parambase + OFFSET(init)]); - cmp(reg_tmp, 0); - je(".read", T_NEAR); - for (int i = 0; i < CTileCount; i++) { - tilezero(Xbyak::Tmm(CTile + i)); - } - jmp(".end", T_NEAR); - L(".read"); - mov(reg_matCptr, ptr[parambase + OFFSET(matC)]); - lea(reg_matCptr, ptr[reg_matCptr + reg_itern * sizeof(CType)]); - load32(reg_cstride, ptr[parambase + OFFSET(cstride)]); - int mtnum = _mtile / 16; - for (int mm = 0; mm < mtnum; mm++) { - for (int i = 0; i < NRegs; i++) { - tileloaddt1(Xbyak::Tmm(CTile + mm * NRegs + i), ptr[reg_matCptr + reg_cstride + i * 64]); - } - if (mm != mtnum - 1) { - lea(reg_matCptr, ptr[reg_matCptr + 8 * reg_cstride]); - lea(reg_matCptr, ptr[reg_matCptr + 8 * reg_cstride]); - } - } - L(".end"); - outLocalLabel(); - } - - void write_back(int _mtile) { - inLocalLabel(); - mov(reg_tmp, dword[parambase + OFFSET(workspace)]); - mov(reg_tmp1, NTILE * 4); - for (int mm = 0; mm < MRegs; mm++) { - for (int i = 0; i < NRegs; i++) { - tilestored(ptr[reg_tmp + reg_tmp1 + i * 64 + mm * 16 * NTILE * 4], Xbyak::Tmm(CTile + mm * NRegs + i)); - } - } - mov(reg_matCptr, ptr[parambase + OFFSET(matC)]); - load32(reg_cstride, ptr[parambase + OFFSET(cstride)]); - lea(reg_matCptr, ptr[reg_matCptr + reg_itern * sizeof(CType)]); - int zunroll = TmpRegCount / NRegs; - for (int i = 0; i < _mtile; i += zunroll) { - int 
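// One tdpbf16ps above computes Ctile(16x16 f32) += Atile(16x32 bf16) * Btile,
// where the B tile holds 32x16 bf16 re-laid out as 16 rows of dword pairs
// (hence reg_Bstride = NTILE * 4 bytes per packed B row). x86 addressing caps
// the lea scale at 8, which is why advancing A by one 16-row tile takes two steps:
//   lea(a, ptr[a + 8 * reg_astride]); lea(a, ptr[a + 8 * reg_astride]);  // += 16 rows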
m_re = utils::remainsize(i, _mtile, zunroll); - for (int im = 0; im < m_re; im++) { - for (int j = 0; j < NRegs; j++) { - vmovups(vreg_t(TmpReg + im * NRegs + j), ptr[reg_tmp + j * 64 + (i + im) * NTILE * 4]); - vmovups(ptr[reg_matCptr + j * VecBytes], vreg_t(TmpReg + im * NRegs + j)); - } - add(reg_matCptr, reg_cstride); - } - } - outLocalLabel(); - } -}; - -template -class Amxint8N16P4 : protected jblas::xbyak::JitAmxint8 { - public: - static int constexpr RegLen = 16, PackRow = 4; - static_assert(_NTILE % RegLen == 0); - static_assert(_MTILE % RegLen == 0); - static int constexpr NRegs = _NTILE / RegLen; - static int constexpr MRegs = _MTILE == 0 ? 1 : _MTILE / RegLen; - static_assert(NRegs * MRegs + 2 <= TileCount); - static int constexpr NTILE = RegLen * NRegs, MTILE = MRegs * RegLen, KTILE = 64; - static int constexpr KUNROLL = 2; - static uint32_t constexpr ISA = (uint32_t)JBLAS_ISA::JblasAMX_INT8; - static uint32_t constexpr COMPUTE = - (uint32_t)(std::is_same_v - ? std::is_same_v ? CompType::COMP_INT8_SS_INT32 : CompType::COMP_INT8_SU_INT32 - : std::is_same_v ? CompType::COMP_INT8_US_INT32 - : CompType::COMP_INT8_UU_INT32); - using AType = AT; - using BType = BT; - typedef int32_t CType; - - struct params { - AType* matA; - int astride; - BType* matB; - int bstride; - CType* matC; - int cstride; - int k; - int n; - int init; - void* workspace; - }; - typedef long long (*func_t)(params*); - - int TmpRegCount = RegCount; - int TmpReg = 0; - int CTileCount = 0, ATileCount = 0, BTileCount = 0; - int CTile = 0, ATile = 0, BTile = 0; - static int constexpr BKStepSize = KTILE * NTILE * sizeof(BType); - static int constexpr AKStepSize = KTILE * sizeof(AType); - - void generate_code(int _mtile) { - assign_regs(); - reset(); - generate_mtile(_mtile); - ready(); - mKernel = getCode(); - } - func_t mKernel = nullptr; - - protected: - Xbyak::Reg64 parambase; - Xbyak::Reg64 reg_matAptr; - Xbyak::Reg64 reg_matBptr; - Xbyak::Reg64 reg_matCptr; - Xbyak::Reg64 reg_ksize; - Xbyak::Reg64 reg_nsize; - Xbyak::Reg64 reg_cstride; - Xbyak::Reg64 reg_astride; - Xbyak::Reg64 reg_iterk; - Xbyak::Reg64 reg_itern; - Xbyak::Reg64 reg_tmp; - Xbyak::Reg64 reg_tmp1; - Xbyak::Reg64 reg_tmp2; - Xbyak::Reg64 reg_tmp3; - Xbyak::Reg64 reg_ret = rax; - - void assign_regs() { - CTileCount = NRegs * MRegs; - auto tile_re = TileCount - CTileCount; - if (tile_re - 1 >= NRegs) { - BTileCount = NRegs; - ATileCount = tile_re - BTileCount; - } else if (tile_re - 1 >= MRegs) { - ATileCount = MRegs; - BTileCount = tile_re - ATileCount; - } else { - ATileCount = 1; - BTileCount = tile_re - ATileCount; - } - CTile = 0; - ATile = CTile + CTileCount; - BTile = ATile + ATileCount; - } - - void generate_mtile(int _mtile) { - inLocalLabel(); // use local label for multiple instance - Xbyak::util::StackFrame st(this, 1, 11, 16 * 10); - parambase = st.p[0]; - reg_matAptr = st.t[0]; - reg_matBptr = st.t[1]; - reg_matCptr = st.t[0]; - reg_ksize = st.t[2]; - reg_astride = st.t[3]; - reg_cstride = st.t[3]; - reg_iterk = st.t[4]; - reg_tmp = st.t[5]; - reg_tmp1 = st.t[6]; - reg_tmp2 = st.t[7]; - reg_tmp3 = st.t[10]; - reg_nsize = st.t[8]; - reg_itern = st.t[9]; - reg_ret = rax; - - vreg_push(rsp); - - load32(reg_ksize, ptr[parambase + OFFSET(k)]); - load32(reg_nsize, ptr[parambase + OFFSET(n)]); - xor_(reg_itern, reg_itern); - L(".nloop"); - init_regs(_mtile); - mov(reg_matAptr, ptr[parambase + OFFSET(matA)]); - load32(reg_astride, ptr[parambase + OFFSET(astride)]); - mov(reg_matBptr, ptr[parambase + OFFSET(matB)]); - load32(reg_tmp, 
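// The std::is_same_v chain above (presumably testing AT and BT against int8_t)
// maps the template signedness to a compute tag:
//   (AT, BT) = (int8_t,  int8_t ) -> COMP_INT8_SS_INT32
//   (AT, BT) = (int8_t,  uint8_t) -> COMP_INT8_SU_INT32
//   (AT, BT) = (uint8_t, int8_t ) -> COMP_INT8_US_INT32
//   (AT, BT) = (uint8_t, uint8_t) -> COMP_INT8_UU_INT32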
ptr[parambase + OFFSET(bstride)]); - imul(reg_tmp, reg_itern); - lea(reg_matBptr, ptr[reg_matBptr + reg_tmp]); - xor_(reg_iterk, reg_iterk); - generate_kloop(_mtile); - write_back(_mtile); - add(reg_itern, NTILE); - cmp(reg_itern, reg_nsize); - jb(".nloop"); - mov(reg_ret, 0); - vreg_pop(rsp); - - outLocalLabel(); // end of local label - } - - void generate_kloop(int _mtile) { - inLocalLabel(); - mov(reg_tmp, reg_ksize); - padto_le(reg_tmp, KUNROLL * KTILE); - cmp(reg_tmp, 0); - jz(".kloop", T_NEAR); - L(".unkloop"); - generate_fma(_mtile, KUNROLL); - add(reg_matAptr, KUNROLL * AKStepSize); - add(reg_matBptr, KUNROLL * BKStepSize); - add(reg_iterk, KUNROLL * KTILE); - cmp(reg_iterk, reg_tmp); // k iteration variable - jb(".unkloop"); - cmp(reg_tmp, reg_ksize); - jge(".kend", T_NEAR); - L(".kloop"); - generate_fma(_mtile, 1); - add(reg_matAptr, 1 * AKStepSize); - add(reg_matBptr, 1 * BKStepSize); - add(reg_iterk, 1 * KTILE); - cmp(reg_iterk, reg_ksize); // k iteration variable - jb(".kloop"); - L(".kend"); - outLocalLabel(); - } - - void generate_fma(int _mtile, int kunrll) { - auto& reg_Bstride = reg_tmp1; - mov(reg_Bstride, NTILE * 4); - int mtiles = _mtile / RegLen; - - for (int kk = 0; kk < kunrll; kk++) { - auto& reg_Atmp = reg_tmp2; - if (mtiles == 1) { - reg_Atmp = reg_matAptr; - } else { - mov(reg_Atmp, reg_matAptr); - } - if (BTileCount == NRegs) { - for (int i = 0; i < NRegs; i++) { - tileloaddt1(Xbyak::Tmm(BTile + i), ptr[reg_matBptr + reg_Bstride + kk * BKStepSize + i * 64]); - } - for (int mm = 0; mm < mtiles; mm++) { - tileloadd(Xbyak::Tmm(ATile), ptr[reg_Atmp + reg_astride + kk * AKStepSize]); - for (int i = 0; i < NRegs; i++) { - _tdpb(Xbyak::Tmm(CTile + mm * NRegs + i), Xbyak::Tmm(ATile), Xbyak::Tmm(BTile + i)); - } - if (mm != mtiles - 1) { - lea(reg_Atmp, ptr[reg_Atmp + 8 * reg_astride]); - lea(reg_Atmp, ptr[reg_Atmp + 8 * reg_astride]); - } - } - } else { - if (ATileCount == mtiles) { - for (int mm = 0; mm < mtiles; mm++) { - tileloadd(Xbyak::Tmm(ATile + mm), ptr[reg_Atmp + reg_astride + kk * AKStepSize]); - if (mm != mtiles - 1) { - lea(reg_Atmp, ptr[reg_Atmp + 8 * reg_astride]); - lea(reg_Atmp, ptr[reg_Atmp + 8 * reg_astride]); - } - } - for (int i = 0; i < NRegs; i++) { - tileloaddt1(Xbyak::Tmm(BTile), ptr[reg_matBptr + reg_Bstride + kk * BKStepSize + i * 64]); - for (int mm = 0; mm < mtiles; mm++) { - _tdpb(Xbyak::Tmm(CTile + mm * NRegs + i), Xbyak::Tmm(ATile + mm), Xbyak::Tmm(BTile)); - } - } - } else { - for (int mm = 0; mm < mtiles; mm++) { - tileloadd(Xbyak::Tmm(ATile), ptr[reg_Atmp + reg_astride + kk * AKStepSize]); - for (int i = 0; i < NRegs; i++) { - tileloaddt1(Xbyak::Tmm(BTile), ptr[reg_matBptr + reg_Bstride + kk * BKStepSize + i * 64]); - _tdpb(Xbyak::Tmm(CTile + mm * NRegs + i), Xbyak::Tmm(ATile), Xbyak::Tmm(BTile)); - } - if (mm != mtiles - 1) { - lea(reg_Atmp, ptr[reg_Atmp + 8 * reg_astride]); - lea(reg_Atmp, ptr[reg_Atmp + 8 * reg_astride]); - } - } - } - } - } - } - - void init_regs(int _mtile) { - inLocalLabel(); - load32(reg_tmp, ptr[parambase + OFFSET(init)]); - cmp(reg_tmp, 0); - je(".read", T_NEAR); - for (int i = 0; i < CTileCount; i++) { - tilezero(Xbyak::Tmm(CTile + i)); - } - jmp(".end", T_NEAR); - L(".read"); - mov(reg_matCptr, ptr[parambase + OFFSET(matC)]); - lea(reg_matCptr, ptr[reg_matCptr + reg_itern * sizeof(CType)]); - load32(reg_cstride, ptr[parambase + OFFSET(cstride)]); - int mtnum = _mtile / 16; - for (int mm = 0; mm < mtnum; mm++) { - for (int i = 0; i < NRegs; i++) { - tileloaddt1(Xbyak::Tmm(CTile + mm * NRegs + i), 
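// Int8 AMX consumes K in chunks of KTILE = 64: one tile row is 64 bytes, so the
// _tdpb below (presumably dispatching to tdpbusd/tdpbssd/etc. on AT/BT signedness)
// computes Ctile(16x16 i32) += Atile(16x64 i8) * Btile(64x16, packed 4-per-dword).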
ptr[reg_matCptr + reg_cstride + i * 64]); - } - if (mm != mtnum - 1) { - lea(reg_matCptr, ptr[reg_matCptr + 8 * reg_cstride]); - lea(reg_matCptr, ptr[reg_matCptr + 8 * reg_cstride]); - } - } - L(".end"); - outLocalLabel(); - } - - void write_back(int _mtile) { - inLocalLabel(); - mov(reg_tmp, dword[parambase + OFFSET(workspace)]); - mov(reg_tmp1, NTILE * 4); - for (int mm = 0; mm < MRegs; mm++) { - for (int i = 0; i < NRegs; i++) { - tilestored(ptr[reg_tmp + reg_tmp1 + i * 64 + mm * 16 * NTILE * 4], Xbyak::Tmm(CTile + mm * NRegs + i)); - } - } - mov(reg_matCptr, ptr[parambase + OFFSET(matC)]); - load32(reg_cstride, ptr[parambase + OFFSET(cstride)]); - lea(reg_matCptr, ptr[reg_matCptr + reg_itern * sizeof(CType)]); - int zunroll = TmpRegCount / NRegs; - for (int i = 0; i < _mtile; i += zunroll) { - int m_re = utils::remainsize(i, _mtile, zunroll); - for (int im = 0; im < m_re; im++) { - for (int j = 0; j < NRegs; j++) { - vmovups(vreg_t(TmpReg + im * NRegs + j), ptr[reg_tmp + j * 64 + (i + im) * NTILE * 4]); - vmovups(ptr[reg_matCptr + j * VecBytes], vreg_t(TmpReg + im * NRegs + j)); - } - add(reg_matCptr, reg_cstride); - } - } - outLocalLabel(); - } -}; -template -using Amxint8N16P4US = Amxint8N16P4; - -template -using Amxint8N16P4SS = Amxint8N16P4; - -class AmxConfigure : protected jblas::xbyak::JitAmxtile { - public: - typedef long long (*func_t)(tileconfig_t*); - - static void configure(int TILE_M, int TILE_N, int TILE_K, int elesize, int ANum, int BNum, int CNum) { - static AmxConfigure code; - tileconfig_t cfg; - std::memset(&cfg, 0, sizeof(cfg)); - configure_tiles(cfg, TILE_M, TILE_N, TILE_K, elesize, ANum, BNum, CNum); - code.mKernel(&cfg); - } - - protected: - AmxConfigure() { - generate_config(this); - mKernel = getCode(); - } - - func_t mKernel = nullptr; -}; - -namespace kblock { -// optimize for kblock gemm, each block size in k dimension has dequant operation -// all accumulators use fp32 dtype. -template -class Avx512fN16P1 : protected jblas::xbyak::JitAvx512f { - public: - static int constexpr RegLen = 16, PackRow = 1; - static_assert(_NTILE % RegLen == 0); - static int constexpr NRegs = _NTILE / RegLen; - static int constexpr MRegs = _MTILE == 0 ? 
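// AmxConfigure fills a tileconfig_t via configure_tiles() and runs a tiny
// generated kernel that presumably issues ldtilecfg. A sketch of a call matching
// the bf16 kernel above (parameter meanings inferred from their names):
//   AmxConfigure::configure(/*TILE_M*/ 16, /*TILE_N*/ 16, /*TILE_K*/ 32,
//                           /*elesize*/ sizeof(utils::bf16),
//                           /*ANum*/ 1, /*BNum*/ 2, /*CNum*/ 4);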
(RegCount - 1) / NRegs : _MTILE;
-  static_assert(NRegs * MRegs <= RegCount - 1);
-  static int constexpr NTILE = RegLen * NRegs, MTILE = MRegs, KTILE = 1;
-  static int constexpr KUNROLL = 2;
-  static uint32_t constexpr ISA = (uint32_t)JBLAS_ISA::JblasAVX512F;
-  static uint32_t constexpr COMPUTE = (uint32_t)CompType::COMP_FP32;
-  typedef float AType;
-  typedef float BType;
-  typedef float CType;
-
-  struct params {
-    AType* matA;
-    int astride;
-    BType* matB;
-    int bstride;
-    CType* matC;
-    int cstride;
-    int k;
-    int n;
-    int init;
-  };
-  typedef long long (*func_t)(params*);
-
-  int CRegCount = 0, BRegCount = 0, ARegCount = 0, TmpRegCount = 0;
-  int CReg = 0, BReg = 0, AReg = 0, TmpReg = 0;
-  static int constexpr BKStepSize = KTILE * NTILE * sizeof(BType);
-  static int constexpr AKStepSize = KTILE * sizeof(AType);
-
-  void generate_code(int _mtile) {
-    assign_regs();
-    reset();
-    generate_mtile(_mtile);
-    ready();
-    mKernel = getCode<func_t>();
-  }
-  func_t mKernel = nullptr;
-
- protected:
-  Xbyak::Reg64 parambase;
-  Xbyak::Reg64 reg_matAptr;
-  Xbyak::Reg64 reg_matBptr;
-  Xbyak::Reg64 reg_matCptr;
-  Xbyak::Reg64 reg_ksize;
-  Xbyak::Reg64 reg_nsize;
-  Xbyak::Reg64 reg_cstride;
-  Xbyak::Reg64 reg_astride;
-  Xbyak::Reg64 reg_iterk;
-  Xbyak::Reg64 reg_itern;
-  Xbyak::Reg64 reg_tmp;
-  Xbyak::Reg64 reg_tmp1;
-  Xbyak::Reg64 reg_tmp2;
-  Xbyak::Reg64 reg_ret = rax;
-  Xbyak::Opmask msk_wr = k1;
-
-  void assign_regs() {
-    CRegCount = MRegs * NRegs;
-    ARegCount = 1;
-    BRegCount = RegCount - ARegCount - CRegCount;
-    if (BRegCount < NRegs) {
-      BRegCount = 0;
-      ARegCount = BRegCount + 1;
-    }
-    if (BRegCount > NRegs) {
-      BRegCount = NRegs;
-    }
-    CReg = 0;
-    BReg = CReg + CRegCount;
-    AReg = BReg + BRegCount;
-    TmpReg = AReg + ARegCount;
-    assert(TmpReg <= RegCount);
-    TmpRegCount = RegCount - TmpReg;
-  }
-
-  void generate_mtile(int _mtile) {
-    inLocalLabel();  // use local label for multiple instance
-    Xbyak::util::StackFrame st(this, 1, 10, 16 * 10);
-    parambase = st.p[0];
-    reg_matAptr = st.t[0];
-    reg_matBptr = st.t[1];
-    reg_matCptr = st.t[0];
-    reg_ksize = st.t[2];
-    reg_astride = st.t[3];
-    reg_cstride = st.t[3];
-    reg_iterk = st.t[4];
-    reg_tmp = st.t[5];
-    reg_tmp1 = st.t[6];
-    reg_tmp2 = st.t[7];
-    reg_nsize = st.t[8];
-    reg_itern = st.t[9];
-    reg_ret = rax;
-
-    vreg_push(rsp);
-
-    load32(reg_ksize, ptr[parambase + OFFSET(k)]);
-    load32(reg_nsize, ptr[parambase + OFFSET(n)]);
-    xor_(reg_itern, reg_itern);
-    L(".nloop");
-    init_regs(_mtile);
-    mov(reg_matAptr, ptr[parambase + OFFSET(matA)]);
-    load32(reg_astride, ptr[parambase + OFFSET(astride)]);
-    mov(reg_matBptr, ptr[parambase + OFFSET(matB)]);
-    load32(reg_tmp, ptr[parambase + OFFSET(bstride)]);
-    imul(reg_tmp, reg_itern);
-    lea(reg_matBptr, ptr[reg_matBptr + reg_tmp]);
-    xor_(reg_iterk, reg_iterk);
-    generate_kloop(_mtile);
-    write_back(_mtile);
-    add(reg_itern, NTILE);
-    cmp(reg_itern, reg_nsize);
-    jb(".nloop");
-    mov(reg_ret, 0);
-    vreg_pop(rsp);
-
-    outLocalLabel();  // end of local label
-  }
-
-  void generate_kloop(int _mtile) {
-    inLocalLabel();
-    mov(reg_tmp, reg_ksize);
-    padto_le(reg_tmp, KUNROLL * KTILE);
-    cmp(reg_tmp, 0);
-    jz(".kloop", T_NEAR);
-    L(".unkloop");
-    generate_fma(_mtile, KUNROLL);
-    add(reg_matAptr, KUNROLL * AKStepSize);
-    add(reg_matBptr, KUNROLL * BKStepSize);
-    add(reg_iterk, KUNROLL * KTILE);
-    cmp(reg_iterk, reg_tmp);  // k iteration variable
-    jb(".unkloop");
-    cmp(reg_tmp, reg_ksize);
-    jge(".kend", T_NEAR);
-    L(".kloop");
-    generate_fma(_mtile, 1);
-    add(reg_matAptr, 1 * AKStepSize);
-    add(reg_matBptr, 1 * BKStepSize);
-    add(reg_iterk, 1 * KTILE);
-    cmp(reg_iterk, reg_ksize);  // k iteration variable
-    jb(".kloop");
-    L(".kend");
-    outLocalLabel();
-  }
-
-  void generate_fma(int _mtile, int _ktile) {
-    for (int kk = 0; kk < _ktile; kk++) {
-      lea(reg_tmp1, ptr[reg_matAptr + kk * AKStepSize]);
-      if (BRegCount == NRegs) {
-        for (int i = 0; i < NRegs; i++) {
-          vmovups(vreg_t(BReg + i), ptr[reg_matBptr + kk * BKStepSize + i * VecBytes]);
-        }
-        for (int mm = 0; mm < _mtile; mm++) {
-          vbroadcastss(vreg_t(AReg), ptr[reg_tmp1]);
-          add(reg_tmp1, reg_astride);
-          for (int i = 0; i < NRegs; i++) {
-            vfmadd231ps(vreg_t(CReg + mm * NRegs + i), vreg_t(AReg), vreg_t(BReg + i));
-          }
-        }
-      } else if (BRegCount == 0) {
-        for (int mm = 0; mm < _mtile; mm += ARegCount) {
-          int mm_re = utils::remainsize(mm, _mtile, ARegCount);
-          for (int imm = 0; imm < mm_re; imm++) {
-            vbroadcastss(vreg_t(AReg + imm), ptr[reg_tmp1]);
-            add(reg_tmp1, reg_astride);
-            for (int i = 0; i < NRegs; i++) {
-              vfmadd231ps(vreg_t(CReg + mm * NRegs + i), vreg_t(AReg + imm),
-                          ptr[reg_matBptr + kk * BKStepSize + i * VecBytes]);
-            }
-          }
-        }
-      } else {
-        assert(0);
-      }
-    }
-  }
-
-  void init_regs(int _mtile) {
-    inLocalLabel();
-    load32(reg_tmp, ptr[parambase + OFFSET(init)]);
-    cmp(reg_tmp, 0);
-    je(".read", T_NEAR);
-    for (int i = 0; i < _mtile; i++) {
-      for (int j = 0; j < NRegs; j++) {
-        vxor(vreg_t(CReg + i * NRegs + j), vreg_t(CReg + i * NRegs + j), vreg_t(CReg + i * NRegs + j));
-      }
-    }
-    jmp(".end", T_NEAR);
-    L(".read");
-    mov(reg_matCptr, ptr[parambase + OFFSET(matC)]);
-    lea(reg_matCptr, ptr[reg_matCptr + reg_itern * sizeof(CType)]);
-    load32(reg_cstride, ptr[parambase + OFFSET(cstride)]);
-    for (int i = 0; i < _mtile; i++) {
-      for (int j = 0; j < NRegs; j++) {
-        vmovups(vreg_t(CReg + i * NRegs + j), ptr[reg_matCptr + j * VecBytes]);
-      }
-      add(reg_matCptr, reg_cstride);
-    }
-    L(".end");
-    outLocalLabel();
-  }
-
-  void write_back(int _mtile) {
-    inLocalLabel();
-    mov(reg_matCptr, ptr[parambase + OFFSET(matC)]);
-    load32(reg_cstride, ptr[parambase + OFFSET(cstride)]);
-    lea(reg_matCptr, ptr[reg_matCptr + reg_itern * sizeof(CType)]);
-    for (int i = 0; i < _mtile; i++) {
-      for (int j = 0; j < NRegs; j++) {
-        vmovups(ptr[reg_matCptr + j * VecBytes], vreg_t(CReg + i * NRegs + j));
-      }
-      add(reg_matCptr, reg_cstride);
-    }
-    outLocalLabel();
-  }
-};
-
-template <int _NTILE, int _MTILE = 0>
-class Avx512vnniN16P4 : protected jblas::xbyak::JitAvx512vnni {
- public:
-  static int constexpr RegLen = 16, PackRow = 4;
-  static_assert(_NTILE % RegLen == 0);
-  static int constexpr NRegs = _NTILE / RegLen;
-  static int constexpr MRegs = _MTILE == 0 ? (RegCount - 1 - NRegs) / (NRegs * 2) : _MTILE;
-  static_assert(NRegs * MRegs <= RegCount - 1);
-  static int constexpr NTILE = RegLen * NRegs, MTILE = MRegs, KTILE = 4;
-  static int constexpr KUNROLL = 2;
-  static uint32_t constexpr ISA = (uint32_t)JBLAS_ISA::JblasAVX512_VNNI;
-  static uint32_t constexpr COMPUTE = (uint32_t)CompType::COMP_INT8_US_FP32;
-  typedef uint8_t AType;
-  typedef int8_t BType;
-  typedef float CType;
-
-  struct params {
-    AType* matA;
-    int astride;
-    BType* matB;
-    int bstride;
-    CType* matC;
-    int cstride;
-    uint8_t* zpA;
-    float* scaleA;
-    int ldsa;
-    float* scaleB;
-    float* reduceB;
-    int ldsb;
-    int k;
-    int n;
-    int kblock;
-    int init;
-  };
-  typedef long long (*func_t)(params*);
-
-  int CRegCount = 0, BRegCount = 0, ARegCount = 0, TmpRegCount = 0;
-  int CReg = 0, CF32Reg = 0, BReg = 0, AReg = 0, TmpReg = 0;
-  static int constexpr BKStepSize = KTILE * NTILE * sizeof(BType);
-  static int constexpr AKStepSize = KTILE * sizeof(AType);
-
-  void generate_code(int _mtile) {
-    assign_regs();
-    reset();
-    generate_mtile(_mtile);
-    ready();
-    mKernel = getCode<func_t>();
-  }
-  func_t mKernel = nullptr;
-
- protected:
-  Xbyak::Reg64 parambase;
-  Xbyak::Reg64 reg_matAptr;
-  Xbyak::Reg64 reg_matBptr;
-  Xbyak::Reg64 reg_matCptr;
-  Xbyak::Reg64 reg_ksize;
-  Xbyak::Reg64 reg_nsize;
-  Xbyak::Reg64 reg_cstride;
-  Xbyak::Reg64 reg_astride;
-  Xbyak::Reg64 reg_iterk;
-  Xbyak::Reg64 reg_iterkb;
-  Xbyak::Reg64 reg_itern;
-  Xbyak::Reg64 reg_tmp;
-  Xbyak::Reg64 reg_tmp1;
-  Xbyak::Reg64 reg_tmp2;
-  Xbyak::Reg64 reg_tmp3;
-  Xbyak::Reg64 reg_tmp4;
-  Xbyak::Reg64 reg_ret = rax;
-
-  void assign_regs() {
-    CRegCount = MRegs * NRegs;
-    ARegCount = 1;
-    BRegCount = NRegs;
-    CReg = 0;
-    CF32Reg = CReg + CRegCount;
-    BReg = CF32Reg + CRegCount;
-    AReg = BReg + BRegCount;
-    TmpReg = AReg + ARegCount;
-    assert(TmpReg < RegCount);
-    TmpRegCount = RegCount - TmpReg;
-    assert(TmpRegCount >= 1);
-  }
-
-  void generate_mtile(int _mtile) {
-    inLocalLabel();  // use local label for multiple instance
-    Xbyak::util::StackFrame st(this, 1, 13, 16 * 10);
-    parambase = st.p[0];
-    reg_matAptr = st.t[0];
-    reg_matBptr = st.t[1];
-    reg_matCptr = st.t[0];
-    reg_ksize = st.t[2];
-    reg_astride = st.t[3];
-    reg_cstride = st.t[3];
-    reg_iterk = st.t[4];
-    reg_iterkb = st.t[12];
-    reg_tmp = st.t[5];
-    reg_tmp1 = st.t[6];
-    reg_tmp2 = st.t[7];
-    reg_tmp3 = st.t[10];
-    reg_tmp4 = st.t[11];
-    reg_nsize = st.t[8];
-    reg_itern = st.t[9];
-    reg_ret = rax;
-
-    vreg_push(rsp);
-
-    load32(reg_ksize, ptr[parambase + OFFSET(k)]);
-    load32(reg_nsize, ptr[parambase + OFFSET(n)]);
-    xor_(reg_itern, reg_itern);
-    L(".nloop");
-    init_regs(_mtile);
-    mov(reg_matAptr, ptr[parambase + OFFSET(matA)]);
-    load32(reg_astride, ptr[parambase + OFFSET(astride)]);
-    mov(reg_matBptr, ptr[parambase + OFFSET(matB)]);
-    load32(reg_tmp, ptr[parambase + OFFSET(bstride)]);
-    imul(reg_tmp, reg_itern);
-    lea(reg_matBptr, ptr[reg_matBptr + reg_tmp]);
-    xor_(reg_iterk, reg_iterk);
-    generate_kloop(_mtile);
-    write_back(_mtile);
-    add(reg_itern, NTILE);
-    cmp(reg_itern, reg_nsize);
-    jb(".nloop");
-    mov(reg_ret, 0);
-    vreg_pop(rsp);
-
-    outLocalLabel();  // end of local label
-  }
-
-  void generate_kloop(int _mtile) {
-    inLocalLabel();
-    xor_(reg_iterkb, reg_iterkb);
-    L(".kloop");
-    for (int i = 0; i < _mtile; i++) {
-      for (int j = 0; j < NRegs; j++) {
-        vpxorq(Xbyak::Zmm(CReg + i * NRegs + j), Xbyak::Zmm(CReg + i * NRegs + j), Xbyak::Zmm(CReg + i * NRegs + j));
-      }
-    }
-    xor_(reg_tmp2, reg_tmp2);
-    load32(reg_tmp3, ptr[parambase + OFFSET(kblock)]);
-    mov(reg_tmp, reg_tmp3);
-    padto_le(reg_tmp, KUNROLL * KTILE);
-    cmp(reg_tmp, 0);
-    jz(".kbloop", T_NEAR);
-    L(".unkbloop");
-    generate_fma(_mtile, KUNROLL, reg_tmp1);
-    add(reg_matAptr, KUNROLL * AKStepSize);
-    add(reg_matBptr, KUNROLL * BKStepSize);
-    add(reg_tmp2, KUNROLL * KTILE);
-    cmp(reg_tmp2, reg_tmp);
-    jb(".unkbloop");
-    cmp(reg_tmp, reg_tmp3);
-    jge(".kend", T_NEAR);
-    L(".kbloop");
-    generate_fma(_mtile, 1, reg_tmp1);
-    add(reg_matAptr, 1 * AKStepSize);
-    add(reg_matBptr, 1 * BKStepSize);
-    add(reg_tmp2, 1 * KTILE);
-    cmp(reg_tmp2, reg_tmp3);
-    jb(".kbloop");
-    L(".kend");
-    add(reg_iterk, reg_tmp2);
-    generate_f32_accumulate(_mtile);
-    generate_zp_correction(_mtile);
-    inc(reg_iterkb);
-    cmp(reg_iterk, reg_ksize);  // k iteration variable
-    jb(".kloop");
-
-    outLocalLabel();
-  }
-
-  void generate_fma(int _mtile, int _ktile, Xbyak::Reg64& tmp) {
-    for (int kk = 0; kk < _ktile; kk++) {
-      lea(tmp, ptr[reg_matAptr + kk * AKStepSize]);
-      for (int i = 0; i < NRegs; i++) {
-        vmovups(vreg_t(BReg + i), ptr[reg_matBptr + kk * BKStepSize + i * VecBytes]);
-      }
-      for (int mm = 0; mm < _mtile; mm++) {
-        vpbroadcastd(vreg_t(AReg), ptr[reg_tmp1]);
-        add(reg_tmp1, reg_astride);
-        for (int i = 0; i < NRegs; i++) {
-          vpdpbusds_(vreg_t(CReg + mm * NRegs + i), vreg_t(AReg), vreg_t(BReg + i));
-        }
-      }
-    }
-  }
-
-  void init_regs(int _mtile) {
-    inLocalLabel();
-    load32(reg_tmp, ptr[parambase + OFFSET(init)]);
-    cmp(reg_tmp, 0);
-    je(".read", T_NEAR);
-    for (int i = 0; i < _mtile; i++) {
-      for (int j = 0; j < NRegs; j++) {
-        vxor(vreg_t(CF32Reg + i * NRegs + j), vreg_t(CF32Reg + i * NRegs + j), vreg_t(CF32Reg + i * NRegs + j));
-      }
-    }
-    jmp(".end", T_NEAR);
-    L(".read");
-    mov(reg_matCptr, ptr[parambase + OFFSET(matC)]);
-    lea(reg_matCptr, ptr[reg_matCptr + reg_itern * sizeof(CType)]);
-    load32(reg_cstride, ptr[parambase + OFFSET(cstride)]);
-    for (int i = 0; i < _mtile; i++) {
-      for (int j = 0; j < NRegs; j++) {
-        vmovups(vreg_t(CF32Reg + i * NRegs + j), ptr[reg_matCptr + j * VecBytes]);
-      }
-      add(reg_matCptr, reg_cstride);
-    }
-    L(".end");
-    outLocalLabel();
-  }
-
-  void generate_f32_accumulate(int _mtile) {
-    load32(reg_tmp, ptr[parambase + OFFSET(ldsb)]);
-    imul(reg_tmp, reg_iterkb);
-    mov(reg_tmp2, ptr[parambase + OFFSET(scaleB)]);
-    lea(reg_tmp2, ptr[reg_tmp2 + reg_tmp * sizeof(float)]);
-    lea(reg_tmp2, ptr[reg_tmp2 + reg_itern * sizeof(float)]);
-
-    mov(reg_tmp, ptr[parambase + OFFSET(scaleA)]);
-    lea(reg_tmp, ptr[reg_tmp + reg_iterkb * sizeof(float)]);
-    load32(reg_tmp1, ptr[parambase + OFFSET(ldsa)]);
-    for (int i = 0; i < NRegs; i++) {
-      vmovups(Xbyak::Zmm(BReg + i), ptr[reg_tmp2 + i * VecBytes]);
-    }
-    for (int mm = 0; mm < _mtile; mm++) {
-      vbroadcastss(Xbyak::Zmm(TmpReg), ptr[reg_tmp]);
-      lea(reg_tmp, ptr[reg_tmp + reg_tmp1 * sizeof(float)]);
-      for (int i = 0; i < NRegs; i++) {
-        vcvtdq2ps(Xbyak::Zmm(CReg + mm * NRegs + i), Xbyak::Zmm(CReg + mm * NRegs + i));
-        vmulps(Xbyak::Zmm(AReg), Xbyak::Zmm(TmpReg), Xbyak::Zmm(BReg + i));
-        vmulps(Xbyak::Zmm(CReg + mm * NRegs + i), Xbyak::Zmm(AReg));
-        vaddps(Xbyak::Zmm(CF32Reg + mm * NRegs + i), Xbyak::Zmm(CReg + mm * NRegs + i));
-      }
-    }
-  }
-
-  void generate_zp_correction(int _mtile) {
-    load32(reg_tmp1, ptr[parambase + OFFSET(ldsb)]);
-    imul(reg_tmp1, reg_iterkb);
-    mov(reg_tmp2, ptr[parambase + OFFSET(reduceB)]);
-    lea(reg_tmp2, ptr[reg_tmp2 + reg_tmp1 * sizeof(float)]);
-    lea(reg_tmp2, ptr[reg_tmp2 + reg_itern * sizeof(float)]);
-    auto& reg_redB = reg_tmp2;
-
-    mov(reg_tmp, ptr[parambase + OFFSET(zpA)]);
-    lea(reg_tmp, ptr[reg_tmp + reg_iterkb * sizeof(AType)]);
-    auto& reg_zpA = reg_tmp;
-
-    mov(reg_tmp1, ptr[parambase + OFFSET(scaleA)]);
-    lea(reg_tmp1, ptr[reg_tmp1 + reg_iterkb * sizeof(float)]);
-    auto& reg_scaleA = reg_tmp1;
-
-    load32(reg_tmp3, ptr[parambase + OFFSET(ldsa)]);
-    auto& reg_ldsa = reg_tmp3;
-    for (int i = 0; i < NRegs; i++) {
-      vmovups(Xbyak::Zmm(BReg + i), ptr[reg_redB + i * VecBytes]);
-    }
-
-    for (int i = 0; i < _mtile; i++) {
-      vpbroadcastb(Xbyak::Xmm(AReg), ptr[reg_zpA]);
-      vpmovzxbd(Xbyak::Zmm(AReg), Xbyak::Xmm(AReg));
-      vcvtdq2ps(Xbyak::Zmm(AReg), Xbyak::Zmm(AReg));
-      vmulps(Xbyak::Zmm(AReg), Xbyak::Zmm(AReg), zword_b[reg_scaleA]);
-      for (int j = 0; j < NRegs; j++) {
-        vmulps(Xbyak::Zmm(CReg + j), Xbyak::Zmm(AReg), Xbyak::Zmm(BReg + j));
-        vsubps(Xbyak::Zmm(CF32Reg + i * NRegs + j), Xbyak::Zmm(CReg + j));
-      }
-      lea(reg_zpA, ptr[reg_zpA + reg_ldsa * sizeof(AType)]);
-      lea(reg_scaleA, ptr[reg_scaleA + reg_ldsa * sizeof(float)]);
-    }
-  }
-
-  void write_back(int _mtile) {
-    inLocalLabel();
-    mov(reg_matCptr, ptr[parambase + OFFSET(matC)]);
-    load32(reg_cstride, ptr[parambase + OFFSET(cstride)]);
-    lea(reg_matCptr, ptr[reg_matCptr + reg_itern * sizeof(CType)]);
-    for (int i = 0; i < _mtile; i++) {
-      for (int j = 0; j < NRegs; j++) {
-        vmovups(ptr[reg_matCptr + j * VecBytes], vreg_t(CF32Reg + i * NRegs + j));
-      }
-      add(reg_matCptr, reg_cstride);
-    }
-    outLocalLabel();
-  }
-};
-
-}  // namespace kblock
-}  // namespace code
-template